From 85f6b932482c1eee4e6af7770b405b8c97b89daa Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Fri, 21 Oct 2022 20:23:10 -0500
Subject: [PATCH 01/51] Move passbuilder into core.

---
 numba_dpex/compiler.py               | 1 -
 numba_dpex/{ => core}/passbuilder.py | 0
 2 files changed, 1 deletion(-)
 rename numba_dpex/{ => core}/passbuilder.py (100%)

diff --git a/numba_dpex/compiler.py b/numba_dpex/compiler.py
index 09872b9870..c68c1f8191 100644
--- a/numba_dpex/compiler.py
+++ b/numba_dpex/compiler.py
@@ -34,7 +34,6 @@
 )
 
 from . import spirv_generator
-from .passbuilder import PassBuilder
 
 _RO_KERNEL_ARG = "read_only"
 _WO_KERNEL_ARG = "write_only"
diff --git a/numba_dpex/passbuilder.py b/numba_dpex/core/passbuilder.py
similarity index 100%
rename from numba_dpex/passbuilder.py
rename to numba_dpex/core/passbuilder.py

From 9dcbd7daea7aba56c6d34061183910c40a170243 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Sat, 22 Oct 2022 15:03:37 -0500
Subject: [PATCH 02/51] Add a compiler module into core.

   - The compiler module only contains the compiler pipeline
     to compiler SpirvKernel objects.
---
 numba_dpex/core/compiler.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 numba_dpex/core/compiler.py

diff --git a/numba_dpex/core/compiler.py b/numba_dpex/core/compiler.py
new file mode 100644
index 0000000000..bd5978d242
--- /dev/null
+++ b/numba_dpex/core/compiler.py
@@ -0,0 +1,28 @@
+# SPDX-FileCopyrightText: 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from numba.core.compiler import CompilerBase, DefaultPassBuilder
+
+from numba_dpex.parfor_diagnostics import ExtendedParforDiagnostics
+
+from .passbuilder import PassBuilder
+
+
+class Compiler(CompilerBase):
+    """The DPEX compiler pipeline."""
+
+    def define_pipelines(self):
+        # this maintains the objmode fallback behaviour
+        pms = []
+        self.state.parfor_diagnostics = ExtendedParforDiagnostics()
+        self.state.metadata[
+            "parfor_diagnostics"
+        ] = self.state.parfor_diagnostics
+        if not self.state.flags.force_pyobject:
+            pms.append(PassBuilder.define_nopython_pipeline(self.state))
+        if self.state.status.can_fallback or self.state.flags.force_pyobject:
+            pms.append(
+                DefaultPassBuilder.define_objectmode_pipeline(self.state)
+            )
+        return pms

From 96e5e87fca4165a861ee756eb8bedfddc3194e6d Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Sat, 22 Oct 2022 15:09:00 -0500
Subject: [PATCH 03/51] Adds an arg_pack_unpack module to kernel_interface

   - Creates a separate module for the unpack and pack
     functions for kernel arguments.
   - The new API is intended for use from the Dispatcher class.
---
 numba_dpex/core/exceptions.py                 |  17 +-
 .../kernel_interface/arg_pack_unpacker.py     | 278 ++++++++++++++++++
 2 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 numba_dpex/core/kernel_interface/arg_pack_unpacker.py

diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py
index 35273fe981..c0880505cc 100644
--- a/numba_dpex/core/exceptions.py
+++ b/numba_dpex/core/exceptions.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""The module defines the custom exception classes used in numba_dpex.
+"""The module defines the custom error classes used in numba_dpex.
 """
 
 from warnings import warn
@@ -218,3 +218,18 @@ def __init__(self) -> None:
         else:
             self.message = "Unreachable code executed."
         super().__init__(self.message)
+
+
+class UnsupportedKernelArgumentError(Exception):
+    def __init__(self, *args: object) -> None:
+        super().__init__(*args)
+
+
+class SUAIProtocolError(Exception):
+    def __init__(self, *args: object) -> None:
+        super().__init__(*args)
+
+
+class UnsupportedAccessQualifierError(Exception):
+    def __init__(self, *args: object) -> None:
+        super().__init__(*args)
diff --git a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
new file mode 100644
index 0000000000..ec1a533a87
--- /dev/null
+++ b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
@@ -0,0 +1,278 @@
+# SPDX-FileCopyrightText: 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import ctypes
+import logging
+from multiprocessing.dummy import Array
+
+import dpctl.memory as dpctl_mem
+import numpy as np
+from numba.core import types
+
+import numba_dpex.utils as utils
+from numba_dpex.core.exceptions import (
+    SUAIProtocolError,
+    UnsupportedAccessQualifierError,
+    UnsupportedKernelArgumentError,
+)
+from numba_dpex.dpctl_iface import USMNdArrayType
+
+
+class Packer:
+
+    # TODO: Remove after NumPy support is removed
+    _access_types = ("read_only", "write_only", "read_write")
+
+    def _check_for_invalid_access_type(self, access_type):
+        if access_type not in Packer._access_types:
+            raise UnsupportedAccessQualifierError()
+        #     msg = (
+        #         "[!] %s is not a valid access type. "
+        #         "Supported access types are [" % (access_type)
+        #     )
+        #     for key in self.valid_access_types:
+        #         msg += " %s |" % (key)
+
+        #     msg = msg[:-1] + "]"
+        #     if access_type is not None:
+        #         print(msg)
+        #     return True
+        # else:
+        #     return False
+
+    def _get_info_from_suai(self, obj):
+        """
+        Extracts the metadata of an arrya-like object that provides a
+        __sycl_usm_array_interface__ (SUAI) attribute.
+
+        The ``dpctl.memory.as_usm_memory`` function converts the array-like
+        object into a dpctl.memory.USMMemory object. Using the ``as_usm_memory``
+        is an implicit way to verify if the array-like object is a legal
+        SYCL USM memory back Python object that can be passed to a dpex kernel.
+
+        Args:
+            obj: array-like object with a SUAI attribute.
+
+        Returns:
+            usm_mem: USM memory object.
+            total_size: Total number of items in the array.
+            shape: Shape of the array.
+            ndim: Total number of dimensions.
+            itemsize: Size of each item.
+            strides: Stride of the array.
+            dtype: Dtype of the array.
+        """
+        try:
+            usm_mem = dpctl_mem.as_usm_memory(obj)
+        except Exception:
+            logging.exception(
+                "array-like object does not implement the SUAI protocol."
+            )
+            # TODO
+            raise SUAIProtocolError()
+
+        shape = obj.__sycl_usm_array_interface__["shape"]
+        total_size = np.prod(obj.__sycl_usm_array_interface__["shape"])
+        ndim = len(obj.__sycl_usm_array_interface__["shape"])
+        itemsize = np.dtype(
+            obj.__sycl_usm_array_interface__["typestr"]
+        ).itemsize
+        dtype = np.dtype(obj.__sycl_usm_array_interface__["typestr"])
+        strides = obj.__sycl_usm_array_interface__["strides"]
+
+        if strides is None:
+            strides = [1] * ndim
+            for i in reversed(range(1, ndim)):
+                strides[i - 1] = strides[i] * shape[i]
+            strides = tuple(strides)
+
+        return usm_mem, total_size, shape, ndim, itemsize, strides, dtype
+
+    def _unpack_array_helper(self, size, itemsize, buf, shape, strides, ndim):
+        """
+        Implements the unpacking logic for array arguments.
+
+        TODO: Add more detail
+
+        Args:
+            size: Total number of elements in the array.
+            itemsize: Size in bytes of each element in the array.
+            buf: The pointer to the memory.
+            shape: The shape of the array.
+            ndim: Number of dimension.
+
+        Returns:
+            A list a ctype value for each array attribute argument
+        """
+        unpacked_array_attrs = []
+
+        # meminfo (FIXME: should be removed and the USMArrayType modified once
+        # NumPy support is removed)
+        unpacked_array_attrs.append(ctypes.c_size_t(0))
+        # meminfo (FIXME: Evaluate if the attribute should be removed and the
+        # USMArrayType modified once NumPy support is removed)
+        unpacked_array_attrs.append(ctypes.c_size_t(0))
+        unpacked_array_attrs.append(ctypes.c_longlong(size))
+        unpacked_array_attrs.append(ctypes.c_longlong(itemsize))
+        unpacked_array_attrs.append(buf)
+        for ax in range(ndim):
+            unpacked_array_attrs.append(ctypes.c_longlong(shape[ax]))
+        for ax in range(ndim):
+            unpacked_array_attrs.append(ctypes.c_longlong(strides[ax]))
+
+        return unpacked_array_attrs
+
+    def _unpack_usm_array(self, val):
+        (
+            usm_mem,
+            total_size,
+            shape,
+            ndim,
+            itemsize,
+            strides,
+            dtype,
+        ) = self._get_info_from_suai(val)
+
+        return self._unpack_array_helper(
+            total_size,
+            itemsize,
+            usm_mem,
+            shape,
+            strides,
+            ndim,
+        )
+
+    def _unpack_array(self, val, access_type):
+        packed_val = val
+        # Check if the NumPy array is backed by USM memory
+        usm_mem = utils.has_usm_memory(val)
+
+        # If the NumPy array is not USM backed, then copy to a USM memory
+        # object. Add an entry to the repack_map so that on exit from kernel
+        # the USM object can be copied back into the NumPy array.
+        if usm_mem is None:
+            self._check_for_invalid_access_type(access_type)
+            usm_mem = utils.as_usm_obj(val, queue=self._queue, copy=False)
+
+            orig_val = val
+            packed = False
+            if not val.flags.c_contiguous:
+                # If the numpy.ndarray is not C-contiguous
+                # we pack the strided array into a packed array.
+                # This allows us to treat the data from here on as C-contiguous.
+                # While packing we treat the data as C-contiguous.
+                # We store the reference of both (strided and packed)
+                # array and during unpacking we use numpy.copyto() to copy
+                # the data back from the packed temporary array to the
+                # original strided array.
+                packed_val = val.flatten(order="C")
+                packed = True
+
+            if access_type == "read_only":
+                utils.copy_from_numpy_to_usm_obj(usm_mem, packed_val)
+            elif access_type == "read_write":
+                utils.copy_from_numpy_to_usm_obj(usm_mem, packed_val)
+                # Store to the repack map
+                self._repack_map.update(
+                    {orig_val: (usm_mem, packed_val, packed)}
+                )
+            elif access_type == "write_only":
+                self._repack_map.update(
+                    {orig_val: (usm_mem, packed_val, packed)}
+                )
+
+        return self._unpack_array_helper(
+            packed_val.size,
+            packed_val.dtype.itemsize,
+            usm_mem,
+            packed_val.shape,
+            packed_val.strides,
+            packed_val.ndim,
+        )
+
+    def _unpack_argument(self, ty, val):
+        """
+        Unpack a Python object into a ctype value using Numba's
+        type-inference machinery.
+
+        Args:
+            ty: The data types of the kernel argument defined as in instance of
+            numba.types.
+            val: The value of the kernel argument.
+
+        Raises:
+            UnsupportedKernelArgumentError: When the argument is of an
+            unsupported type.
+
+        """
+
+        if isinstance(ty, USMNdArrayType):
+            return self._unpack_usm_array(val)
+        elif isinstance(ty, Array):
+            return self._unpack_array(val)
+        elif ty == types.int64:
+            return ctypes.c_longlong(val)
+        elif ty == types.uint64:
+            return ctypes.c_ulonglong(val)
+        elif ty == types.int32:
+            return ctypes.c_int(val)
+        elif ty == types.uint32:
+            return ctypes.c_uint(val)
+        elif ty == types.float64:
+            return ctypes.c_double(val)
+        elif ty == types.float32:
+            return ctypes.c_float(val)
+        elif ty == types.boolean:
+            return ctypes.c_uint8(int(val))
+        elif ty == types.complex64:
+            raise UnsupportedKernelArgumentError(ty, val)
+        elif ty == types.complex128:
+            raise UnsupportedKernelArgumentError(ty, val)
+        else:
+            raise UnsupportedKernelArgumentError(ty, val)
+
+    def _pack_array(self):
+        """
+        Copy device data back to host
+        """
+        for obj in self._repack_map.keys():
+
+            (usm_mem, packed_ndarr, packed) = self._repack_map[obj]
+            utils.copy_to_numpy_from_usm_obj(usm_mem, packed_ndarr)
+            if packed:
+                np.copyto(obj, packed_ndarr)
+
+    def __init__(self, arg_list, argty_list, queue) -> None:
+        """_summary_
+
+        Args:
+            arg_list (_type_): _description_
+            argty_list (_type_): _description_
+            queue: _description_
+        """
+        self._arg_list = arg_list
+        self._argty_list = argty_list
+        self._queue = queue
+
+        # loop over the arg_list and generate the kernelargs list
+        self._unpacked_args = []
+        for i, val in enumerate(arg_list):
+            arg = self._unpack_argument(ty=argty_list[i], val=val)
+            if type(arg) == list:
+                self._unpacked_args.extend(arg)
+            else:
+                self._unpacked_args.append(arg)
+
+        # Create a map for numpy arrays storing the unpacked information, as
+        # these arrays will need to be repacked.
+        self._repack_map = {}
+
+    @property
+    def unpacked_args(self):
+        return self._unpacked_args
+
+    @property
+    def repacked_args(self):
+        self._pack_array()
+        return self._repack_map.keys()

From 2939444dcb40129b5d3a49bdcf2d2c1f0728c8d3 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 25 Oct 2022 23:24:59 -0500
Subject: [PATCH 04/51] Change exception behavior

---
 numba_dpex/core/exceptions.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py
index c0880505cc..d3a8d92100 100644
--- a/numba_dpex/core/exceptions.py
+++ b/numba_dpex/core/exceptions.py
@@ -112,8 +112,8 @@ def __init__(
         self, kernel_name, ndarray_argnum_list=None, *, usmarray_argnum_list
     ) -> None:
         if ndarray_argnum_list and usmarray_argnum_list:
-            ndarray_args = ",".join(ndarray_argnum_list)
-            usmarray_args = ",".join(usmarray_argnum_list)
+            ndarray_args = ",".join([str(i) for i in ndarray_argnum_list])
+            usmarray_args = ",".join([str(i) for i in usmarray_argnum_list])
             self.message = (
                 f'Kernel "{kernel_name}" has arguments of both usm_ndarray and '
                 "non-usm_ndarray types. Mixing of arguments of different "
@@ -122,7 +122,7 @@ def __init__(
                 f"and arguments {usmarray_args} are usm arrays."
             )
         elif usmarray_argnum_list:
-            usmarray_args = ",".join(usmarray_argnum_list)
+            usmarray_args = ",".join([str(i) for i in usmarray_argnum_list])
             self.message = (
                 f'Execution queue for kernel "{kernel_name}" could '
                 "be deduced using compute follows data programming model. The "

From f91e278eb79a673cc12d03f92fec2ef54a5412ba Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Mon, 17 Oct 2022 22:47:17 -0500
Subject: [PATCH 05/51] Refactored kernel dispatch API.

  - The concept of a kernel was decoupled from the notion of
    dispatching of a kernel. The present implementation in
    compiler.py intermixes both things, making hard the
    separation of compute-follows-data based kernel launch
    and legacy `dpctl.device_context` based behavior.
  - Deprecates support for numpy arrays as kernel args.
  - Deprecates support for the square bracket notation using
    `__getitem__` to provide global and local ranges for a
    kernel launch.
  - Changes the behavior of specializing a kernel using only
    a signature. The new way to specialize will require a
    device type and a backend.
  - Improvements to exception messages using custom exceptions.
  - The new API is now inside `numba_dpex.core.kernel_interface`.
---
 numba_dpex/core/kernel_interface/__init__.py  |   6 +
 .../core/kernel_interface/dispatcher.py       | 404 ++++++++++++++++++
 .../core/kernel_interface/kernel_base.py      |  60 +++
 .../core/kernel_interface/spirv_kernel.py     | 175 ++++++++
 4 files changed, 645 insertions(+)
 create mode 100644 numba_dpex/core/kernel_interface/__init__.py
 create mode 100644 numba_dpex/core/kernel_interface/dispatcher.py
 create mode 100644 numba_dpex/core/kernel_interface/kernel_base.py
 create mode 100644 numba_dpex/core/kernel_interface/spirv_kernel.py

diff --git a/numba_dpex/core/kernel_interface/__init__.py b/numba_dpex/core/kernel_interface/__init__.py
new file mode 100644
index 0000000000..5557024c06
--- /dev/null
+++ b/numba_dpex/core/kernel_interface/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Defines the Kernel classes that abstract a SYCL device kernel.
+"""
diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
new file mode 100644
index 0000000000..1d26faaff4
--- /dev/null
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -0,0 +1,404 @@
+# SPDX-FileCopyrightText: 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import copy
+from warnings import warn
+
+import dpctl
+import dpctl.program as dpctl_prog
+from numba.core.types import Array as ArrayType
+
+from numba_dpex import config
+from numba_dpex.core.descriptor import dpex_target
+from numba_dpex.core.exceptions import (
+    ComputeFollowsDataInferenceError,
+    ExecutionQueueInferenceError,
+    InvalidKernelLaunchArgsError,
+    UnknownGlobalRangeError,
+    UnsupportedBackendError,
+)
+from numba_dpex.core.kernel_interface.arg_pack_unpacker import Packer
+from numba_dpex.core.kernel_interface.spirv_kernel import SpirvKernel
+from numba_dpex.dpctl_iface import USMNdArrayType
+
+
+class Dispatcher(object):
+    """Creates a Kernel object from a @kernel decorated function and enqueues
+    the Kernel object on a specified device.
+    """
+
+    # The list of SYCL backends supported by the Dispatcher
+    _supported_backends = ["opencl", "level_zero"]
+
+    def __init__(
+        self,
+        pyfunc,
+        debug_flags=None,
+        compile_flags=None,
+        array_access_specifiers=None,
+    ):
+        self.typingctx = dpex_target.typing_context
+        self.pyfunc = pyfunc
+        self.debug_flags = debug_flags
+        self.compile_flags = compile_flags
+        self.kernel_name = pyfunc.__name__
+        self._global_range = None
+        self._local_range = None
+
+        if array_access_specifiers:
+            warn(
+                "Access specifiers apply only to NumPy ndarrays. "
+                + "Support for NumPy ndarray objects as kernel arguments "
+                + "and access specifiers flags is deprecated. "
+                + "Use dpctl.tensor.usm_ndarray based arrays instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            self.array_access_specifiers = array_access_specifiers
+
+        if debug_flags or config.OPT == 0:
+            # if debug is ON we need to pass additional
+            # flags to igc.
+            self._create_sycl_kernel_bundle_flags = ["-g", "-cl-opt-disable"]
+        else:
+            self._create_sycl_kernel_bundle_flags = []
+
+    # def _ensure_valid_work_item_grid(self, val, sycl_queue):
+
+    #     if not isinstance(val, (tuple, list, int)):
+    #         error_message = (
+    #             "Cannot create work item dimension from provided argument"
+    #         )
+    #         raise ValueError(error_message)
+
+    #     if isinstance(val, int):
+    #         val = [val]
+
+    #     # TODO: we need some way to check the max dimensions
+    #     """
+    #     if len(val) > device_env.get_max_work_item_dims():
+    #         error_message = ("Unsupported number of work item dimensions ")
+    #         raise ValueError(error_message)
+    #     """
+
+    #     return list(
+    #         val[::-1]
+    #     )  # reversing due to sycl and opencl interop kernel range mismatch semantic
+
+    def _determine_compute_follows_data_queue(self, usm_array_list):
+        """Determine the execution queue for the list of usm array args using
+        compute follows data rules.
+
+        Uses ``dpctl.utils.get_execution_queue()`` to check if the list of
+        queues belonging to the usm_ndarrays are equivalent. If the queues are
+        equivalent, then returns the queue. If the queues are not equivalent
+        then returns None.
+
+        Args:
+            usm_array_list : A list of usm_ndarray objects
+
+        Returns:
+            A queue the common queue used to allocate the arrays. If no such
+            queue exists, then returns None.
+        """
+
+        queues = [usm_array.sycl_queue for usm_array in usm_array_list]
+        return dpctl.utils.get_execution_queue(queues)
+
+    def _determine_kernel_launch_queue(self, args, argtypes):
+        """Determines the queue where the kernel is to be launched.
+
+        The execution queue is derived using the following algorithm. In future,
+        support for ``numpy.ndarray`` and ``dpctl.device_context`` is to be
+        removed and queue derivation will follows Python Array API's
+        "compute follows data" logic.
+
+        Check if there are array arguments.
+        True:
+          Check if all array arguments are of type numpy.ndarray
+          (numba.types.Array)
+              True:
+                  Check if the kernel was invoked from within a
+                  dpctl.device_context.
+                  True:
+                      Provide a deprecation warning for device_context use and
+                      point to using dpctl.tensor.usm_ndarray or dpnp.ndarray
+
+                      return dpctl.get_current_queue
+                  False:
+                      Raise ExecutionQueueInferenceError
+              False:
+                  Check if all of the arrays are USMNdarray
+                      True:
+                          Check if execution queue could be inferred using
+                          compute follows data rules
+                          True:
+                              return the compute follows data inferred queue
+                          False:
+                              Raise ComputeFollowsDataInferenceError
+                      False:
+                          Raise ComputeFollowsDataInferenceError
+        False:
+          Check if the kernel was invoked from within a dpctl.device_context.
+            True:
+                Provide a deprecation warning for device_context use and
+                point to using dpctl.tensor.usm_ndarray of dpnp.ndarray
+
+                return dpctl.get_current_queue
+            False:
+                Raise ExecutionQueueInferenceError
+
+        Args:
+            args : A list of arguments passed to the kernel stored in the
+            launcher.
+            argtypes : The Numba inferred type for each argument.
+
+        Returns:
+            A queue the common queue used to allocate the arrays. If no such
+            queue exists, then raises an Exception.
+
+        Raises:
+            ComputeFollowsDataInferenceError: If the queue could not be inferred
+                using compute follows data rules.
+            ExecutionQueueInferenceError: If the queue could not be inferred
+                using the dpctl queue manager.
+        """
+        # Temporary workaround as USMNdArrayType derives from Array
+        array_argnums = [
+            i
+            for i, arg in enumerate(args)
+            if isinstance(argtypes[i], ArrayType)
+            and not isinstance(argtypes[i], USMNdArrayType)
+        ]
+        usmarray_argnums = [
+            i
+            for i, arg in enumerate(args)
+            if isinstance(argtypes[i], USMNdArrayType)
+        ]
+
+        # if usm and non-usm array arguments are getting mixed, then the
+        # execution queue cannot be inferred using compute follows data rules.
+        if array_argnums and usmarray_argnums:
+            raise ComputeFollowsDataInferenceError(
+                array_argnums, usmarray_argnum_list=usmarray_argnums
+            )
+        elif array_argnums and not usmarray_argnums:
+            if dpctl.is_in_device_context():
+                warn(
+                    "Support for dpctl.device_context to specify the "
+                    + "execution queue is deprecated. "
+                    + "Use dpctl.tensor.usm_ndarray based array "
+                    + "containers instead. ",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+                warn(
+                    "Support for NumPy ndarray objects as kernel arguments is "
+                    + "deprecated. Use dpctl.tensor.usm_ndarray based array "
+                    + "containers instead. ",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+                return dpctl.get_current_queue()
+            else:
+                raise ExecutionQueueInferenceError(self.kernel.name)
+        elif usmarray_argnums and not array_argnums:
+            if dpctl.is_in_device_context():
+                warn(
+                    "dpctl.device_context ignored as the kernel arguments "
+                    + "are dpctl.tensor.usm_ndarray based array containers."
+                )
+            usm_array_args = [
+                arg for i, arg in enumerate(args) if i in usmarray_argnums
+            ]
+            queue = self._determine_compute_follows_data_queue(usm_array_args)
+            if not queue:
+                raise ComputeFollowsDataInferenceError(
+                    self.kernel.name, usmarray_argnums
+                )
+            else:
+                return queue
+        else:
+            if dpctl.is_in_device_context():
+                warn(
+                    "Support for dpctl.device_context to specify the "
+                    + "execution queue is deprecated. "
+                    + "Use dpctl.tensor.usm_ndarray based array "
+                    + "containers instead. ",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+                return dpctl.get_current_queue()
+            else:
+                raise ExecutionQueueInferenceError(self.kernel.name)
+
+    def __getitem__(self, args):
+        """Mimic's ``numba.cuda`` square-bracket notation for configuring the
+        global_range and local_range settings when launching a kernel on a
+        SYCL queue.
+
+        When a Python function decorated with the @kernel decorator,
+        is invoked it creates a KernelLauncher object. Calling the
+        KernelLauncher objects ``__getitem__`` function inturn clones the object
+        and sets the ``global_range`` and optionally the ``local_range``
+        attributes with the arguments passed to ``__getitem__``.
+
+        Args:
+            args (tuple): A tuple of tuples that specify the global and
+            optionally the local range for the kernel execution. If the
+            argument is a two-tuple of tuple, then it is assumed that both
+            global and local range options are specified. The first entry is
+            considered to be the global range and the second the local range.
+
+            If only a single tuple value is provided, then the kernel is
+            launched with only a global range and the local range configuration
+            is decided by the SYCL runtime.
+
+        Returns:
+            KernelLauncher: A clone of the KernelLauncher object, but with the
+            global_range and local_range attributes initialized.
+
+        .. deprecated:: 0.19
+            Use :func:`KernelLauncher.execute` instead.
+        """
+
+        warn(
+            "The [] (__getitem__) method to set global and local ranges for "
+            + "launching a kernel is deprecated. "
+            + "Use the execute function instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        nargs = len(args)
+
+        # Check if the kernel launch arguments are sane.
+        if nargs < 1:
+            raise UnknownGlobalRangeError(kernel_name=self.kernel_name)
+        elif nargs > 2:
+            raise InvalidKernelLaunchArgsError(
+                kernel_name=self.kernel_name, args=args
+            )
+
+        self.global_range = args[0]
+        if nargs == 2 and args[1] != []:
+            self.local_range = args[1]
+        else:
+            self.local_range = None
+
+        return copy.copy(self)
+
+    def _get_ranges(self, global_range, local_range):
+        """_summary_
+
+        Args:
+            global_range (_type_): _description_
+            local_range (_type_): _description_
+
+        Raises:
+            UnknownGlobalRangeError: _description_
+        """
+        if global_range:
+            if self._global_range:
+                warn(
+                    "Ignoring the previously set value of global_range and "
+                    + "using the value specified at the kernel call site."
+                )
+        else:
+            if self._global_range:
+                warn(
+                    "Use of __getitem__ to set the global_range attribute is "
+                    + 'deprecated. Use the keyword argument "global_range" of '
+                    + "__call__ method to set the attribute."
+                )
+                global_range = self._global_range
+
+            else:
+                raise UnknownGlobalRangeError(self.kernel_name)
+
+        if local_range:
+            if self._local_range:
+                warn(
+                    "Ignoring the previously set value of local_range and "
+                    + "using the value specified at the kernel call site.."
+                )
+        else:
+            if self._local_range:
+                warn(
+                    "Use of __getitem__ to set the local_range attribute is "
+                    + 'deprecated. Use the keyword argument "local_range" of '
+                    + "__call__ method to set the attribute."
+                )
+                local_range = self._local_range
+            else:
+                local_range = None
+                warn(
+                    "Kernel to be submitted without a local range letting "
+                    + "the SYCL runtime select a local range. The behavior "
+                    + "can lead to suboptimal performance in certain cases. "
+                    + "Consider setting the local range value for the kernel "
+                    + "execution.\n"
+                    + "The local_range keyword may be made a required argument "
+                    + "in the future."
+                )
+
+        # TODO: Test global and local ranges to be valid for the device
+
+        return (global_range, local_range)
+
+    def __call__(self, *args, global_range, local_range=None):
+        """_summary_
+
+        Args:
+            global_range (_type_): _description_
+            local_range (_type_): _description_.
+        """
+        argtypes = [self.typingctx.resolve_argument_type(arg) for arg in args]
+
+        # FIXME: Remove along with __getitem__
+        global_range, local_range = self._get_ranges(global_range, local_range)
+
+        exec_queue = self._determine_kernel_launch_queue(args, argtypes)
+        backend = exec_queue.backend
+
+        if exec_queue.backend not in [
+            dpctl.backend_type.opencl,
+            dpctl.backend_type.level_zero,
+        ]:
+            raise UnsupportedBackendError(
+                self.kernel_name, backend, Dispatcher._supported_backends
+            )
+
+        # TODO: Enable caching of kernels, but do it using Numba's caching
+        # machinery
+
+        kernel = SpirvKernel(self.pyfunc, self.kernel_name)
+        kernel.compile(
+            arg_types=argtypes,
+            debug=self.debug_flags,
+            extra_compile_flags=self.compile_flags,
+        )
+
+        # create a sycl::KernelBundle
+        kernel_bundle = dpctl_prog.create_program_from_spirv(
+            exec_queue,
+            kernel.device_driver_ir_module,
+            " ".join(self._create_sycl_kernel_bundle_flags),
+        )
+        #  get the sycl::kernel
+        kernel = kernel_bundle.get_sycl_kernel(kernel.module_name)
+
+        packer = Packer(arg_list=args, argty_list=argtypes, queue=exec_queue)
+
+        exec_queue.submit(
+            kernel,
+            packer.unpacked_args,
+            global_range,
+            local_range,
+        )
+
+        exec_queue.wait()
+
+        # TODO remove once NumPy support is removed
+        packer.repacked_args
diff --git a/numba_dpex/core/kernel_interface/kernel_base.py b/numba_dpex/core/kernel_interface/kernel_base.py
new file mode 100644
index 0000000000..12c32c2446
--- /dev/null
+++ b/numba_dpex/core/kernel_interface/kernel_base.py
@@ -0,0 +1,60 @@
+# SPDX-FileCopyrightText: 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import abc
+
+
+class KernelInterface(metaclass=abc.ABCMeta):
+    """An interface for compute kernel that was generated either from a
+    Python function object or as a Numba IR FunctionType object.
+
+    Args:
+        metaclass (optional): The interface is derived from abc.ABCMeta.
+
+    Raises:
+        NotImplementedError: The interface does not implement any of the
+        methods and subclasses are required to implement them.
+    """
+
+    @classmethod
+    def __subclasshook__(cls, subclass):
+        return hasattr(
+            (subclass, "llvm_module")
+            and hasattr(subclass, "device_driver_ir_module")
+            and hasattr(subclass, "pyfunc_name")
+            and hasattr(subclass, "module_name")
+            and hasattr(subclass, "compile")
+            and callable(subclass.compile)
+        )
+
+    # TODO Add a property for argtypes
+
+    @property
+    @abc.abstractmethod
+    def llvm_module(self):
+        """The LLVM IR Module corresponding to the Kernel instance."""
+        raise NotImplementedError
+
+    @property
+    @abc.abstractmethod
+    def device_driver_ir_module(self):
+        """The module in a device IR (such as SPIR-V or PTX) format."""
+        raise NotImplementedError
+
+    @property
+    @abc.abstractmethod
+    def pyfunc_name(self):
+        """The Python function name corresponding to the Kernel instance."""
+        raise NotImplementedError
+
+    @property
+    @abc.abstractmethod
+    def module_name(self):
+        """The LLVM module name for the compiled kernel."""
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def compile(self, target_ctx, args, debug, compile_flags):
+        """Abstract method to compile a Kernel instance."""
+        raise NotImplementedError
diff --git a/numba_dpex/core/kernel_interface/spirv_kernel.py b/numba_dpex/core/kernel_interface/spirv_kernel.py
new file mode 100644
index 0000000000..3acdc52ece
--- /dev/null
+++ b/numba_dpex/core/kernel_interface/spirv_kernel.py
@@ -0,0 +1,175 @@
+# SPDX-FileCopyrightText: 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from types import FunctionType
+
+from numba.core import compiler, ir
+from numba.core import types as numba_types
+from numba.core.compiler_lock import global_compiler_lock
+
+from numba_dpex import compiler as dpex_compiler
+from numba_dpex import config, spirv_generator
+from numba_dpex.core.descriptor import dpex_target
+from numba_dpex.core.exceptions import (
+    KernelHasReturnValueError,
+    UncompiledKernelError,
+    UnreachableError,
+)
+
+from .kernel_base import KernelInterface
+
+
+class SpirvKernel(KernelInterface):
+    def __init__(self, func, pyfunc_name) -> None:
+        self._llvm_module = None
+        self._device_driver_ir_module = None
+        self._module_name = None
+        self._pyfunc_name = pyfunc_name
+        self._func = func
+        if isinstance(func, FunctionType):
+            self._func_ty = FunctionType
+        elif isinstance(func, ir.FunctionIR):
+            self._func_ty = ir.FunctionIR
+        else:
+            raise UnreachableError()
+
+    @global_compiler_lock
+    def _compile(self, pyfunc, args, debug=None, extra_compile_flags=None):
+        """
+        Compiles the function using the dpex compiler pipeline and returns the
+        compiled result.
+
+        Args:
+            pyfunc: The function to be compiled. Can be a Python function or a
+            Numba IR object representing a function.
+            args: The list of arguments passed to the kernel.
+            debug (bool): Optional flag to turn on debug mode compilation.
+            extra_compile_flags: Extra flags passed to the compiler.
+
+        Returns:
+            cres: Compiled result.
+
+        Raises:
+            KernelHasReturnValueError: If the compiled function returns a
+            non-void value.
+        """
+        # First compilation will trigger the initialization of the backend.
+        typingctx = dpex_target.typing_context
+        targetctx = dpex_target.target_context
+
+        flags = compiler.Flags()
+        # Do not compile the function to a binary, just lower to LLVM
+        flags.debuginfo = config.DEBUGINFO_DEFAULT
+        flags.no_compile = True
+        flags.no_cpython_wrapper = True
+        flags.nrt = False
+
+        if debug is not None:
+            flags.debuginfo = debug
+
+        # Run compilation pipeline
+        if isinstance(pyfunc, FunctionType):
+            cres = compiler.compile_extra(
+                typingctx=typingctx,
+                targetctx=targetctx,
+                func=pyfunc,
+                args=args,
+                return_type=None,
+                flags=flags,
+                locals={},
+                pipeline_class=dpex_compiler.Compiler,
+            )
+        elif isinstance(pyfunc, ir.FunctionIR):
+            cres = compiler.compile_ir(
+                typingctx=typingctx,
+                targetctx=targetctx,
+                func_ir=pyfunc,
+                args=args,
+                return_type=None,
+                flags=flags,
+                locals={},
+                pipeline_class=dpex_compiler.Compiler,
+            )
+        else:
+            raise UnreachableError()
+
+        if (
+            cres.signature.return_type is not None
+            and cres.signature.return_type != numba_types.void
+        ):
+            raise KernelHasReturnValueError(
+                kernel_name=pyfunc.__name__,
+                return_type=cres.signature.return_type,
+            )
+        # Linking depending libraries
+        library = cres.library
+        library.finalize()
+
+        return cres
+
+    @property
+    def llvm_module(self):
+        """The LLVM IR Module corresponding to the Kernel instance."""
+        if self._llvm_module:
+            return self._llvm_module
+        else:
+            raise UncompiledKernelError(self._pyfunc_name)
+
+    @property
+    def device_driver_ir_module(self):
+        """The module in a device IR (such as SPIR-V or PTX) format."""
+        if self._device_driver_ir_module:
+            return self._device_driver_ir_module
+        else:
+            raise UncompiledKernelError(self._pyfunc_name)
+
+    @property
+    def pyfunc_name(self):
+        """The Python function name corresponding to the kernel."""
+        return self._pyfunc_name
+
+    @property
+    def module_name(self):
+        """The name of the compiled LLVM module for the kernel."""
+        if self._module_name:
+            return self._module_name
+        else:
+            raise UncompiledKernelError(self._pyfunc_name)
+
+    def compile(self, arg_types, debug, extra_compile_flags):
+        """_summary_
+
+        Args:
+            arg_types (_type_): _description_
+            debug (_type_): _description_
+            extra_compile_flags (_type_): _description_
+        """
+
+        logging.debug("compiling SpirvKernel with arg types", arg_types)
+
+        cres = self._compile(
+            pyfunc=self._func,
+            args=arg_types,
+            debug=debug,
+            extra_compile_flags=extra_compile_flags,
+        )
+
+        self._target_context = cres.target_context
+
+        func = cres.library.get_function(cres.fndesc.llvm_func_name)
+        kernel = cres.target_context.prepare_ocl_kernel(
+            func, cres.signature.args
+        )
+        self._llvm_module = kernel.module.__str__()
+        self._module_name = kernel.name
+
+        # FIXME: There is no need to serialize the bitcode. It can be passed to
+        # llvm-spirv directly via stdin.
+
+        # FIXME: There is no need for spirv-dis. We cause use --to-text
+        # (or --spirv-text) to convert SPIRV to text
+        self._device_driver_ir_module = spirv_generator.llvm_to_spirv(
+            self._target_context, self._llvm_module, kernel.module.as_bitcode()
+        )

From 87b3d99c5ba17fc35369c6afdf47013bac107b82 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Mon, 24 Oct 2022 14:50:27 -0500
Subject: [PATCH 06/51] Temp commit to add a driver.py to run refactored code
 base.

---
 driver.py                                     | 36 +++++++++++++++++++
 .../core/kernel_interface/dispatcher.py       |  2 ++
 2 files changed, 38 insertions(+)
 create mode 100644 driver.py

diff --git a/driver.py b/driver.py
new file mode 100644
index 0000000000..fcc1461e80
--- /dev/null
+++ b/driver.py
@@ -0,0 +1,36 @@
+#! /usr/bin/env python
+
+# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+import dpctl.tensor as dpt
+
+import numba_dpex as dpex
+from numba_dpex.core.kernel_interface.dispatcher import Dispatcher
+
+
+# @dpex.kernel
+def data_parallel_sum(a, b, c):
+    """
+    Vector addition using the ``kernel`` decorator.
+    """
+    i = dpex.get_global_id(0)
+    c[i] = a[i] + b[i]
+
+
+def main():
+    a = dpt.arange(0, 100, device="level_zero:gpu:0")
+    b = dpt.arange(0, 100, device="level_zero:gpu:0")
+    c = dpt.zeros_like(a, device="level_zero:gpu:0")
+
+    d = Dispatcher(pyfunc=data_parallel_sum)
+    d(a, b, c, global_range=[100])
+    # data_parallel_sum[(10,), (1)](a, b, c)
+    print(dpt.asnumpy(a))
+    print(dpt.asnumpy(b))
+    print(dpt.asnumpy(c))
+    print("Done...")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index 1d26faaff4..36b6b17927 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -43,6 +43,7 @@ def __init__(
         self.debug_flags = debug_flags
         self.compile_flags = compile_flags
         self.kernel_name = pyfunc.__name__
+        # To be removed
         self._global_range = None
         self._local_range = None
 
@@ -355,6 +356,7 @@ def __call__(self, *args, global_range, local_range=None):
             local_range (_type_): _description_.
         """
         argtypes = [self.typingctx.resolve_argument_type(arg) for arg in args]
+        breakpoint()
 
         # FIXME: Remove along with __getitem__
         global_range, local_range = self._get_ranges(global_range, local_range)

From 2b7267b57a8f348ccdecfc653dec6e4ab4d9ffce Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Thu, 27 Oct 2022 22:19:14 -0500
Subject: [PATCH 07/51] Improve exceptions.

---
 numba_dpex/core/exceptions.py                 | 53 ++++++++++++++++---
 .../kernel_interface/arg_pack_unpacker.py     | 36 +++++--------
 2 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py
index d3a8d92100..ca165034f4 100644
--- a/numba_dpex/core/exceptions.py
+++ b/numba_dpex/core/exceptions.py
@@ -221,15 +221,56 @@ def __init__(self) -> None:
 
 
 class UnsupportedKernelArgumentError(Exception):
-    def __init__(self, *args: object) -> None:
-        super().__init__(*args)
+    """Exception raised when the type of a kernel argument is not supported by
+    the compiler.
+
+    Args:
+        kernel_name (str): Name of kernel where the error was raised.
+    """
+
+    def __init__(self, type, value, kernel_name) -> None:
+        self.message = (
+            f'Argument {value} passed to kernel "{kernel_name}" is of an '
+            f"unsupported type ({type})."
+        )
+        super().__init__(self.message)
 
 
 class SUAIProtocolError(Exception):
-    def __init__(self, *args: object) -> None:
-        super().__init__(*args)
+    """Exception raised when an array-like object passed to a kernel is
+    neither a NumPy array nor does it implement the __sycl_usm_array_interface__
+    attribute.
+
+    Args:
+        kernel_name (str): Name of kernel where the error was raised.
+        arg: Array-like object
+    """
+
+    def __init__(self, kernel_name, arg) -> None:
+        self.message = (
+            f'Array-like argument {arg} passed to kernel "{kernel_name}" '
+            "is neither a NumPy array nor implement the "
+            "__sycl_usm_array_interface__."
+        )
+        super().__init__(self.message)
 
 
 class UnsupportedAccessQualifierError(Exception):
-    def __init__(self, *args: object) -> None:
-        super().__init__(*args)
+    """Exception raised when an illegal access specifier value is specified for an
+    NumPy array argument passed to a kernel.
+
+    Args:
+        kernel_name (str): Name of kernel where the error was raised.
+        array_val: name of the array argument with the illegal access specifier.
+        illegal_access_type (str): The illegal access specifier string.
+        legal_access_list (str): Joined string for the legal access specifiers.
+    """
+
+    def __init__(
+        self, kernel_name, array_val, illegal_access_type, legal_access_list
+    ) -> None:
+        self.message = f"Invalid access type {illegal_access_type} applied to "
+        f'array {array_val} argument passed to kernel "{kernel_name}". '
+        f"Legal access specifiers are {legal_access_list}."
+
+        super().__init__(self.message)
diff --git a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
index ec1a533a87..c50a5710cf 100644
--- a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
+++ b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
@@ -24,22 +24,14 @@ class Packer:
     # TODO: Remove after NumPy support is removed
     _access_types = ("read_only", "write_only", "read_write")
 
-    def _check_for_invalid_access_type(self, access_type):
+    def _check_for_invalid_access_type(self, array_val, access_type):
         if access_type not in Packer._access_types:
-            raise UnsupportedAccessQualifierError()
-        #     msg = (
-        #         "[!] %s is not a valid access type. "
-        #         "Supported access types are [" % (access_type)
-        #     )
-        #     for key in self.valid_access_types:
-        #         msg += " %s |" % (key)
-
-        #     msg = msg[:-1] + "]"
-        #     if access_type is not None:
-        #         print(msg)
-        #     return True
-        # else:
-        #     return False
+            raise UnsupportedAccessQualifierError(
+                self._pyfunc_name,
+                array_val,
+                access_type,
+                ",".join(Packer._access_types),
+            )
 
     def _get_info_from_suai(self, obj):
         """
@@ -69,8 +61,7 @@ def _get_info_from_suai(self, obj):
             logging.exception(
                 "array-like object does not implement the SUAI protocol."
             )
-            # TODO
-            raise SUAIProtocolError()
+            raise SUAIProtocolError(self._pyfunc_name, obj)
 
         shape = obj.__sycl_usm_array_interface__["shape"]
         total_size = np.prod(obj.__sycl_usm_array_interface__["shape"])
@@ -152,7 +143,7 @@ def _unpack_array(self, val, access_type):
         # object. Add an entry to the repack_map so that on exit from kernel
         # the USM object can be copied back into the NumPy array.
         if usm_mem is None:
-            self._check_for_invalid_access_type(access_type)
+            self._check_for_invalid_access_type(val, access_type)
             usm_mem = utils.as_usm_obj(val, queue=self._queue, copy=False)
 
             orig_val = val
@@ -226,11 +217,11 @@ def _unpack_argument(self, ty, val):
         elif ty == types.boolean:
             return ctypes.c_uint8(int(val))
         elif ty == types.complex64:
-            raise UnsupportedKernelArgumentError(ty, val)
+            raise UnsupportedKernelArgumentError(ty, val, self._pyfunc_name)
         elif ty == types.complex128:
-            raise UnsupportedKernelArgumentError(ty, val)
+            raise UnsupportedKernelArgumentError(ty, val, self._pyfunc_name)
         else:
-            raise UnsupportedKernelArgumentError(ty, val)
+            raise UnsupportedKernelArgumentError(ty, val, self._pyfunc_name)
 
     def _pack_array(self):
         """
@@ -243,7 +234,7 @@ def _pack_array(self):
             if packed:
                 np.copyto(obj, packed_ndarr)
 
-    def __init__(self, arg_list, argty_list, queue) -> None:
+    def __init__(self, kernel_name, arg_list, argty_list, queue) -> None:
         """_summary_
 
         Args:
@@ -251,6 +242,7 @@ def __init__(self, arg_list, argty_list, queue) -> None:
             argty_list (_type_): _description_
             queue: _description_
         """
+        self._pyfunc_name = kernel_name
         self._arg_list = arg_list
         self._argty_list = argty_list
         self._queue = queue

From 390da76feaf5c12a90d7ac7a637d5b5e4129e42d Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Thu, 3 Nov 2022 23:42:04 -0500
Subject: [PATCH 08/51] Update the temporary driver.

---
 driver.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/driver.py b/driver.py
index fcc1461e80..9f2b32ab60 100644
--- a/driver.py
+++ b/driver.py
@@ -9,7 +9,7 @@
 from numba_dpex.core.kernel_interface.dispatcher import Dispatcher
 
 
-# @dpex.kernel
+@dpex.kernel
 def data_parallel_sum(a, b, c):
     """
     Vector addition using the ``kernel`` decorator.
@@ -23,9 +23,9 @@ def main():
     b = dpt.arange(0, 100, device="level_zero:gpu:0")
     c = dpt.zeros_like(a, device="level_zero:gpu:0")
 
-    d = Dispatcher(pyfunc=data_parallel_sum)
-    d(a, b, c, global_range=[100])
-    # data_parallel_sum[(10,), (1)](a, b, c)
+    # d = Dispatcher(pyfunc=data_parallel_sum)
+    # d(a, b, c, global_range=[100])
+    data_parallel_sum[(100,)](a, b, c)
     print(dpt.asnumpy(a))
     print(dpt.asnumpy(b))
     print(dpt.asnumpy(c))

From d9b6ffa9cf4f8eeecdcd5c00b2642fb540b58036 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Fri, 4 Nov 2022 18:40:27 -0500
Subject: [PATCH 09/51] Added global range checks, kernel now uses dispatcher.

---
 numba_dpex/compiler.py                        |  15 ---
 numba_dpex/core/exceptions.py                 |  39 ++++++
 .../kernel_interface/arg_pack_unpacker.py     |  21 ++--
 .../core/kernel_interface/dispatcher.py       | 112 ++++++++++++------
 numba_dpex/decorators.py                      |  21 ++--
 5 files changed, 143 insertions(+), 65 deletions(-)

diff --git a/numba_dpex/compiler.py b/numba_dpex/compiler.py
index c68c1f8191..1e69df240f 100644
--- a/numba_dpex/compiler.py
+++ b/numba_dpex/compiler.py
@@ -62,21 +62,6 @@ def _raise_invalid_kernel_enqueue_args():
     raise ValueError(error_message)
 
 
-def get_ordered_arg_access_types(pyfunc, access_types):
-    # Construct a list of access type of each arg according to their position
-    ordered_arg_access_types = []
-    sig = signature(pyfunc, follow_wrapped=False)
-    for idx, arg_name in enumerate(sig.parameters):
-        if access_types:
-            for key in access_types:
-                if arg_name in access_types[key]:
-                    ordered_arg_access_types.append(key)
-        if len(ordered_arg_access_types) <= idx:
-            ordered_arg_access_types.append(None)
-
-    return ordered_arg_access_types
-
-
 class Compiler(CompilerBase):
     """The DPEX compiler pipeline."""
 
diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py
index ca165034f4..a552c6df1f 100644
--- a/numba_dpex/core/exceptions.py
+++ b/numba_dpex/core/exceptions.py
@@ -82,6 +82,45 @@ def __init__(self, kernel_name) -> None:
         super().__init__(self.message)
 
 
+class IllegalRangeValueError(Exception):
+    def __init__(self, kernel_name) -> None:
+        self.message = (
+            f"Kernel {kernel_name} cannot be dispatched with the "
+            "specified range. The range should be specified as a list, tuple, "
+            "or an int."
+        )
+        super().__init__(self.message)
+
+
+class UnsupportedNumberOfRangeDimsError(Exception):
+    def __init__(self, kernel_name, ndims, max_work_item_dims) -> None:
+        self.message = (
+            f"Specified range for kernel {kernel_name} has {ndims} dimensions, "
+            f"the device supports only {max_work_item_dims} dimensional "
+            "ranges."
+        )
+        super().__init__(self.message)
+
+
+class UnsupportedWorkItemSizeError(Exception):
+    """
+
+    Args:
+        Exception (_type_): _description_
+    """
+
+    def __init__(
+        self, kernel_name, dim, requested_work_items, supported_work_items
+    ) -> None:
+        self.message = (
+            f"Attempting to launch kernel {kernel_name} with "
+            f"{requested_work_items} work items in dimension {dim} is not "
+            f"supported. The device supports only {supported_work_items} "
+            f"work items for dimension {dim}."
+        )
+        super().__init__(self.message)
+
+
 class ComputeFollowsDataInferenceError(Exception):
     """Exception raised when an execution queue for a given array expression or
     a kernel function could not be deduced using the compute-follows-data
diff --git a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
index c50a5710cf..644d352d07 100644
--- a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
+++ b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
@@ -4,7 +4,6 @@
 
 import ctypes
 import logging
-from multiprocessing.dummy import Array
 
 import dpctl.memory as dpctl_mem
 import numpy as np
@@ -25,7 +24,7 @@ class Packer:
     _access_types = ("read_only", "write_only", "read_write")
 
     def _check_for_invalid_access_type(self, array_val, access_type):
-        if access_type not in Packer._access_types:
+        if access_type and access_type not in Packer._access_types:
             raise UnsupportedAccessQualifierError(
                 self._pyfunc_name,
                 array_val,
@@ -141,7 +140,7 @@ def _unpack_array(self, val, access_type):
 
         # If the NumPy array is not USM backed, then copy to a USM memory
         # object. Add an entry to the repack_map so that on exit from kernel
-        # the USM object can be copied back into the NumPy array.
+        # the data from the USM object can be copied back into the NumPy array.
         if usm_mem is None:
             self._check_for_invalid_access_type(val, access_type)
             usm_mem = utils.as_usm_obj(val, queue=self._queue, copy=False)
@@ -182,7 +181,7 @@ def _unpack_array(self, val, access_type):
             packed_val.ndim,
         )
 
-    def _unpack_argument(self, ty, val):
+    def _unpack_argument(self, ty, val, access_specifier):
         """
         Unpack a Python object into a ctype value using Numba's
         type-inference machinery.
@@ -200,8 +199,8 @@ def _unpack_argument(self, ty, val):
 
         if isinstance(ty, USMNdArrayType):
             return self._unpack_usm_array(val)
-        elif isinstance(ty, Array):
-            return self._unpack_array(val)
+        elif isinstance(ty, types.Array):
+            return self._unpack_array(val, access_specifier)
         elif ty == types.int64:
             return ctypes.c_longlong(val)
         elif ty == types.uint64:
@@ -234,7 +233,9 @@ def _pack_array(self):
             if packed:
                 np.copyto(obj, packed_ndarr)
 
-    def __init__(self, kernel_name, arg_list, argty_list, queue) -> None:
+    def __init__(
+        self, kernel_name, arg_list, argty_list, access_specifiers_list, queue
+    ) -> None:
         """_summary_
 
         Args:
@@ -250,7 +251,11 @@ def __init__(self, kernel_name, arg_list, argty_list, queue) -> None:
         # loop over the arg_list and generate the kernelargs list
         self._unpacked_args = []
         for i, val in enumerate(arg_list):
-            arg = self._unpack_argument(ty=argty_list[i], val=val)
+            arg = self._unpack_argument(
+                ty=argty_list[i],
+                val=val,
+                access_specifier=access_specifiers_list[i],
+            )
             if type(arg) == list:
                 self._unpacked_args.extend(arg)
             else:
diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index 36b6b17927..10880c8f3a 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import copy
+from inspect import signature
 from warnings import warn
 
 import dpctl
@@ -14,15 +15,33 @@
 from numba_dpex.core.exceptions import (
     ComputeFollowsDataInferenceError,
     ExecutionQueueInferenceError,
+    IllegalRangeValueError,
     InvalidKernelLaunchArgsError,
     UnknownGlobalRangeError,
     UnsupportedBackendError,
+    UnsupportedNumberOfRangeDimsError,
+    UnsupportedWorkItemSizeError,
 )
 from numba_dpex.core.kernel_interface.arg_pack_unpacker import Packer
 from numba_dpex.core.kernel_interface.spirv_kernel import SpirvKernel
 from numba_dpex.dpctl_iface import USMNdArrayType
 
 
+def get_ordered_arg_access_types(pyfunc, access_types):
+    # Construct a list of access type of each arg according to their position
+    ordered_arg_access_types = []
+    sig = signature(pyfunc, follow_wrapped=False)
+    for idx, arg_name in enumerate(sig.parameters):
+        if access_types:
+            for key in access_types:
+                if arg_name in access_types[key]:
+                    ordered_arg_access_types.append(key)
+        if len(ordered_arg_access_types) <= idx:
+            ordered_arg_access_types.append(None)
+
+    return ordered_arg_access_types
+
+
 class Dispatcher(object):
     """Creates a Kernel object from a @kernel decorated function and enqueues
     the Kernel object on a specified device.
@@ -43,7 +62,7 @@ def __init__(
         self.debug_flags = debug_flags
         self.compile_flags = compile_flags
         self.kernel_name = pyfunc.__name__
-        # To be removed
+        # TODO: To be removed once the__getitem__ is removed
         self._global_range = None
         self._local_range = None
 
@@ -56,7 +75,7 @@ def __init__(
                 DeprecationWarning,
                 stacklevel=2,
             )
-            self.array_access_specifiers = array_access_specifiers
+        self.array_access_specifiers = array_access_specifiers
 
         if debug_flags or config.OPT == 0:
             # if debug is ON we need to pass additional
@@ -65,27 +84,30 @@ def __init__(
         else:
             self._create_sycl_kernel_bundle_flags = []
 
-    # def _ensure_valid_work_item_grid(self, val, sycl_queue):
+    def _check_range(self, range, device):
 
-    #     if not isinstance(val, (tuple, list, int)):
-    #         error_message = (
-    #             "Cannot create work item dimension from provided argument"
-    #         )
-    #         raise ValueError(error_message)
+        if not isinstance(range, (tuple, list)):
+            raise IllegalRangeValueError(self.kernel_name)
 
-    #     if isinstance(val, int):
-    #         val = [val]
+        max_work_item_dims = device.max_work_item_dims
 
-    #     # TODO: we need some way to check the max dimensions
-    #     """
-    #     if len(val) > device_env.get_max_work_item_dims():
-    #         error_message = ("Unsupported number of work item dimensions ")
-    #         raise ValueError(error_message)
-    #     """
+        if len(range) > max_work_item_dims:
+            raise UnsupportedNumberOfRangeDimsError(
+                kernel_name=self.kernel_name,
+                ndims=len(range),
+                max_work_item_dims=max_work_item_dims,
+            )
 
-    #     return list(
-    #         val[::-1]
-    #     )  # reversing due to sycl and opencl interop kernel range mismatch semantic
+    def _check_ndrange(self, global_range, local_range, device):
+        # for dim, size in enumerate(val):
+        #     if val[dim] > work_item_sizes[dim]:
+        #         raise UnsupportedWorkItemSizeError(
+        #             kernel_name=self.kernel_name,
+        #             dim=dim,
+        #             requested_work_items=val[dim],
+        #             supported_work_items=work_item_sizes[dim],
+        #         )
+        pass
 
     def _determine_compute_follows_data_queue(self, usm_array_list):
         """Determine the execution queue for the list of usm array args using
@@ -273,7 +295,6 @@ def __getitem__(self, args):
         )
 
         nargs = len(args)
-
         # Check if the kernel launch arguments are sane.
         if nargs < 1:
             raise UnknownGlobalRangeError(kernel_name=self.kernel_name)
@@ -281,16 +302,15 @@ def __getitem__(self, args):
             raise InvalidKernelLaunchArgsError(
                 kernel_name=self.kernel_name, args=args
             )
-
-        self.global_range = args[0]
+        self._global_range = args[0]
         if nargs == 2 and args[1] != []:
-            self.local_range = args[1]
+            self._local_range = args[1]
         else:
-            self.local_range = None
+            self._local_range = None
 
         return copy.copy(self)
 
-    def _get_ranges(self, global_range, local_range):
+    def _get_ranges(self, global_range, local_range, device):
         """_summary_
 
         Args:
@@ -314,7 +334,6 @@ def _get_ranges(self, global_range, local_range):
                     + "__call__ method to set the attribute."
                 )
                 global_range = self._global_range
-
             else:
                 raise UnknownGlobalRangeError(self.kernel_name)
 
@@ -344,11 +363,31 @@ def _get_ranges(self, global_range, local_range):
                     + "in the future."
                 )
 
-        # TODO: Test global and local ranges to be valid for the device
+        if isinstance(global_range, int):
+            global_range = [global_range]
+
+        # If only global range value is provided, then the kernel is invoked
+        # over an N-dimensional index space defined by a SYCL range<N>, where
+        # N is one, two or three.
+        # If both local and global range values are specified the kernel is
+        # invoked using a SYCL nd_range
+        if global_range and not local_range:
+            self._check_range(global_range, device)
+            global_range = list(global_range)
+        else:
+            if isinstance(local_range, int):
+                local_range = [local_range]
+            self._check_ndrange(
+                global_range=global_range,
+                local_range=local_range,
+                device=device,
+            )
+            global_range = list(global_range)
+            local_range = list(local_range)
 
         return (global_range, local_range)
 
-    def __call__(self, *args, global_range, local_range=None):
+    def __call__(self, *args, global_range=None, local_range=None):
         """_summary_
 
         Args:
@@ -356,10 +395,6 @@ def __call__(self, *args, global_range, local_range=None):
             local_range (_type_): _description_.
         """
         argtypes = [self.typingctx.resolve_argument_type(arg) for arg in args]
-        breakpoint()
-
-        # FIXME: Remove along with __getitem__
-        global_range, local_range = self._get_ranges(global_range, local_range)
 
         exec_queue = self._determine_kernel_launch_queue(args, argtypes)
         backend = exec_queue.backend
@@ -372,6 +407,11 @@ def __call__(self, *args, global_range, local_range=None):
                 self.kernel_name, backend, Dispatcher._supported_backends
             )
 
+        # TODO: Refactor after __getitem__ is removed
+        global_range, local_range = self._get_ranges(
+            global_range, local_range, exec_queue.sycl_device
+        )
+
         # TODO: Enable caching of kernels, but do it using Numba's caching
         # machinery
 
@@ -391,7 +431,13 @@ def __call__(self, *args, global_range, local_range=None):
         #  get the sycl::kernel
         kernel = kernel_bundle.get_sycl_kernel(kernel.module_name)
 
-        packer = Packer(arg_list=args, argty_list=argtypes, queue=exec_queue)
+        packer = Packer(
+            kernel_name=self.kernel_name,
+            arg_list=args,
+            argty_list=argtypes,
+            queue=exec_queue,
+            access_specifiers_list=self.array_access_specifiers,
+        )
 
         exec_queue.submit(
             kernel,
diff --git a/numba_dpex/decorators.py b/numba_dpex/decorators.py
index fdfe224df5..b936fed90c 100644
--- a/numba_dpex/decorators.py
+++ b/numba_dpex/decorators.py
@@ -6,14 +6,13 @@
 from numba.core import sigutils, types
 
 from numba_dpex.core.exceptions import KernelHasReturnValueError
-from numba_dpex.utils import npytypes_array_to_dpex_array
-
-from .compiler import (
-    JitKernel,
-    compile_func,
-    compile_func_template,
+from numba_dpex.core.kernel_interface.dispatcher import (
+    Dispatcher,
     get_ordered_arg_access_types,
 )
+from numba_dpex.utils import npytypes_array_to_dpex_array
+
+from .compiler import JitKernel, compile_func, compile_func_template
 
 
 def kernel(signature=None, access_types=None, debug=None):
@@ -37,13 +36,17 @@ def kernel(signature=None, access_types=None, debug=None):
 
 
 def autojit(debug=None, access_types=None):
-    def _kernel_autojit(pyfunc):
+    def _kernel_dispatcher(pyfunc):
         ordered_arg_access_types = get_ordered_arg_access_types(
             pyfunc, access_types
         )
-        return JitKernel(pyfunc, debug, ordered_arg_access_types)
+        return Dispatcher(
+            pyfunc=pyfunc,
+            debug_flags=debug,
+            array_access_specifiers=ordered_arg_access_types,
+        )
 
-    return _kernel_autojit
+    return _kernel_dispatcher
 
 
 def _kernel_jit(signature, debug, access_types):

From cd5d48ee72569ea2b78c0b35cbcf60ce5aea5974 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 8 Nov 2022 13:51:35 -0600
Subject: [PATCH 10/51] Fix the pack/repack of Numpy arrays

---
 .../kernel_interface/arg_pack_unpacker.py     | 46 +++++++++++++------
 .../core/kernel_interface/dispatcher.py       |  8 ++--
 2 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
index 644d352d07..088c7c3c9a 100644
--- a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
+++ b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
@@ -18,6 +18,14 @@
 from numba_dpex.dpctl_iface import USMNdArrayType
 
 
+class _NumPyArrayPackerPayload:
+    def __init__(self, usm_mem, orig_val, packed_val, packed) -> None:
+        self._usm_mem = usm_mem
+        self._orig_val = orig_val
+        self._packed_val = packed_val
+        self._packed = packed
+
+
 class Packer:
 
     # TODO: Remove after NumPy support is removed
@@ -164,12 +172,23 @@ def _unpack_array(self, val, access_type):
             elif access_type == "read_write":
                 utils.copy_from_numpy_to_usm_obj(usm_mem, packed_val)
                 # Store to the repack map
-                self._repack_map.update(
-                    {orig_val: (usm_mem, packed_val, packed)}
+                self._repack_list.append(
+                    _NumPyArrayPackerPayload(
+                        usm_mem, orig_val, packed_val, packed
+                    )
                 )
             elif access_type == "write_only":
-                self._repack_map.update(
-                    {orig_val: (usm_mem, packed_val, packed)}
+                self._repack_list.append(
+                    _NumPyArrayPackerPayload(
+                        usm_mem, orig_val, packed_val, packed
+                    )
+                )
+            else:
+                utils.copy_from_numpy_to_usm_obj(usm_mem, packed_val)
+                self._repack_list.append(
+                    _NumPyArrayPackerPayload(
+                        usm_mem, orig_val, packed_val, packed
+                    )
                 )
 
         return self._unpack_array_helper(
@@ -226,12 +245,10 @@ def _pack_array(self):
         """
         Copy device data back to host
         """
-        for obj in self._repack_map.keys():
-
-            (usm_mem, packed_ndarr, packed) = self._repack_map[obj]
-            utils.copy_to_numpy_from_usm_obj(usm_mem, packed_ndarr)
-            if packed:
-                np.copyto(obj, packed_ndarr)
+        for obj in self._repack_list:
+            utils.copy_to_numpy_from_usm_obj(obj._usm_mem, obj._packed_val)
+            if obj._packed:
+                np.copyto(obj.orig_val, obj._packed_val)
 
     def __init__(
         self, kernel_name, arg_list, argty_list, access_specifiers_list, queue
@@ -247,6 +264,9 @@ def __init__(
         self._arg_list = arg_list
         self._argty_list = argty_list
         self._queue = queue
+        # Create a list to store the numpy arrays that need to be
+        # repacked beoe returning from a kernel.
+        self._repack_list = []
 
         # loop over the arg_list and generate the kernelargs list
         self._unpacked_args = []
@@ -261,10 +281,6 @@ def __init__(
             else:
                 self._unpacked_args.append(arg)
 
-        # Create a map for numpy arrays storing the unpacked information, as
-        # these arrays will need to be repacked.
-        self._repack_map = {}
-
     @property
     def unpacked_args(self):
         return self._unpacked_args
@@ -272,4 +288,4 @@ def unpacked_args(self):
     @property
     def repacked_args(self):
         self._pack_array()
-        return self._repack_map.keys()
+        return self._repack_list
diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index 10880c8f3a..c6028c3a0b 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -373,7 +373,9 @@ def _get_ranges(self, global_range, local_range, device):
         # invoked using a SYCL nd_range
         if global_range and not local_range:
             self._check_range(global_range, device)
-            global_range = list(global_range)
+            # FIXME:[::-1] is done as OpenCL and SYCl have different orders when it
+            # comes to specifying dimensions.
+            global_range = list(global_range)[::-1]
         else:
             if isinstance(local_range, int):
                 local_range = [local_range]
@@ -382,8 +384,8 @@ def _get_ranges(self, global_range, local_range, device):
                 local_range=local_range,
                 device=device,
             )
-            global_range = list(global_range)
-            local_range = list(local_range)
+            global_range = list(global_range)[::-1]
+            local_range = list(local_range)[::-1]
 
         return (global_range, local_range)
 

From c4a474923ced1f7dbabf1db50e432a792bc5d859 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 8 Nov 2022 15:29:21 -0600
Subject: [PATCH 11/51] Update existing compute follows data unit test.

---
 numba_dpex/core/kernel_interface/dispatcher.py   |  6 +++---
 .../kernel_tests/test_compute_follows_data.py    | 16 ++++------------
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index c6028c3a0b..3411867dac 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -225,7 +225,7 @@ def _determine_kernel_launch_queue(self, args, argtypes):
                 )
                 return dpctl.get_current_queue()
             else:
-                raise ExecutionQueueInferenceError(self.kernel.name)
+                raise ExecutionQueueInferenceError(self.kernel_name)
         elif usmarray_argnums and not array_argnums:
             if dpctl.is_in_device_context():
                 warn(
@@ -238,7 +238,7 @@ def _determine_kernel_launch_queue(self, args, argtypes):
             queue = self._determine_compute_follows_data_queue(usm_array_args)
             if not queue:
                 raise ComputeFollowsDataInferenceError(
-                    self.kernel.name, usmarray_argnums
+                    self.kernel_name, usmarray_argnum_list=usmarray_argnums
                 )
             else:
                 return queue
@@ -254,7 +254,7 @@ def _determine_kernel_launch_queue(self, args, argtypes):
                 )
                 return dpctl.get_current_queue()
             else:
-                raise ExecutionQueueInferenceError(self.kernel.name)
+                raise ExecutionQueueInferenceError(self.kernel_name)
 
     def __getitem__(self, args):
         """Mimic's ``numba.cuda`` square-bracket notation for configuring the
diff --git a/numba_dpex/tests/kernel_tests/test_compute_follows_data.py b/numba_dpex/tests/kernel_tests/test_compute_follows_data.py
index ad16d9c921..91f233a3ff 100644
--- a/numba_dpex/tests/kernel_tests/test_compute_follows_data.py
+++ b/numba_dpex/tests/kernel_tests/test_compute_follows_data.py
@@ -10,17 +10,12 @@
 import pytest
 
 import numba_dpex
+from numba_dpex.core.exceptions import ComputeFollowsDataInferenceError
 from numba_dpex.tests._helper import (
     filter_strings,
     skip_no_level_zero_gpu,
     skip_no_opencl_gpu,
 )
-from numba_dpex.utils import (
-    IndeterminateExecutionQueueError,
-    IndeterminateExecutionQueueError_msg,
-    cfd_ctx_mgr_wrng_msg,
-    mix_datatype_err_msg,
-)
 
 global_size = 10
 local_size = 1
@@ -115,7 +110,7 @@ def test_ndarray_argtype(offload_device, input_arrays):
 def test_mix_argtype(offload_device, input_arrays):
     usm_type = "device"
 
-    a, b, expected = input_arrays
+    a, b, _ = input_arrays
     got = np.ones_like(a)
 
     device = dpctl.SyclDevice(offload_device)
@@ -136,11 +131,9 @@ def test_mix_argtype(offload_device, input_arrays):
         buffer_ctor_kwargs={"queue": queue},
     )
 
-    with pytest.raises(TypeError) as error_msg:
+    with pytest.raises(ComputeFollowsDataInferenceError):
         sum_kernel[global_size, local_size](da, b, dc)
 
-        assert mix_datatype_err_msg in error_msg
-
 
 @pytest.mark.parametrize("offload_device", filter_strings)
 def test_context_manager_with_usm_ndarray(offload_device, input_arrays):
@@ -235,9 +228,8 @@ def test_equivalent_usm_ndarray(input_arrays):
         buffer_ctor_kwargs={"queue": queue1},
     )
 
-    with pytest.raises(IndeterminateExecutionQueueError) as error_msg:
+    with pytest.raises(ComputeFollowsDataInferenceError):
         sum_kernel[global_size, local_size](da, not_equivalent_db, dc)
-        assert IndeterminateExecutionQueueError_msg in str(error_msg.value)
 
     sum_kernel[global_size, local_size](da, equivalent_db, dc)
     dc.usm_data.copy_to_host(got.reshape((-1)).view("|u1"))

From 8e0403ca8b875443875f39f042181aad2dafc0a2 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 8 Nov 2022 15:49:07 -0600
Subject: [PATCH 12/51] Fix failing test_sycl_usm_array_iface_interop tests.

---
 numba_dpex/core/kernel_interface/dispatcher.py       | 12 +++++++++---
 .../integration/test_sycl_usm_array_iface_interop.py |  2 +-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index 3411867dac..9cf8fc3e09 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -17,6 +17,7 @@
     ExecutionQueueInferenceError,
     IllegalRangeValueError,
     InvalidKernelLaunchArgsError,
+    SUAIProtocolError,
     UnknownGlobalRangeError,
     UnsupportedBackendError,
     UnsupportedNumberOfRangeDimsError,
@@ -111,7 +112,7 @@ def _check_ndrange(self, global_range, local_range, device):
 
     def _determine_compute_follows_data_queue(self, usm_array_list):
         """Determine the execution queue for the list of usm array args using
-        compute follows data rules.
+        compute follows data programming model.
 
         Uses ``dpctl.utils.get_execution_queue()`` to check if the list of
         queues belonging to the usm_ndarrays are equivalent. If the queues are
@@ -125,8 +126,13 @@ def _determine_compute_follows_data_queue(self, usm_array_list):
             A queue the common queue used to allocate the arrays. If no such
             queue exists, then returns None.
         """
-
-        queues = [usm_array.sycl_queue for usm_array in usm_array_list]
+        queues = []
+        for usm_array in usm_array_list:
+            try:
+                q = usm_array.__sycl_usm_array_interface__["syclobj"]
+                queues.append(q)
+            except:
+                raise SUAIProtocolError(self.kernel_name, usm_array)
         return dpctl.utils.get_execution_queue(queues)
 
     def _determine_kernel_launch_queue(self, args, argtypes):
diff --git a/numba_dpex/tests/integration/test_sycl_usm_array_iface_interop.py b/numba_dpex/tests/integration/test_sycl_usm_array_iface_interop.py
index 143b992059..2a127715c7 100644
--- a/numba_dpex/tests/integration/test_sycl_usm_array_iface_interop.py
+++ b/numba_dpex/tests/integration/test_sycl_usm_array_iface_interop.py
@@ -33,7 +33,7 @@ def test_kernel_valid_usm_obj(dtype):
     """Test if a ``numba_dpex.kernel`` function accepts a DuckUSMArray argument.
 
     The ``DuckUSMArray`` uses ``dpctl.memory`` to allocate a Python object that
-    defines a __sycl_usm_array__interface__ attribute. We test if
+    defines a ``__sycl_usm_array__interface__`` attribute. We test if
     ``numba_dpex`` recognizes the ``DuckUSMArray`` as a valid USM-backed Python
     object and accepts it as a kernel argument.
 

From 3c997b8a1563680b46604a3cae9f67918a8e84dd Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 8 Nov 2022 17:24:34 -0600
Subject: [PATCH 13/51] Rewrite test to use refactored API.

---
 .../tests/kernel_tests/test_atomic_op.py      | 37 ++++++++++---------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/numba_dpex/tests/kernel_tests/test_atomic_op.py b/numba_dpex/tests/kernel_tests/test_atomic_op.py
index 1d568027d5..fd036fed10 100644
--- a/numba_dpex/tests/kernel_tests/test_atomic_op.py
+++ b/numba_dpex/tests/kernel_tests/test_atomic_op.py
@@ -176,7 +176,6 @@ def skip_if_disabled(*args):
     return pytest.param(*args, marks=skip_NATIVE_FP_ATOMICS_0)
 
 
-@pytest.mark.parametrize("filter_str", filter_strings)
 @skip_no_atomic_support
 @pytest.mark.parametrize(
     "NATIVE_FP_ATOMICS, expected_native_atomic_for_device",
@@ -197,7 +196,6 @@ def skip_if_disabled(*args):
 )
 @pytest.mark.parametrize("dtype", list_of_f_dtypes)
 def test_atomic_fp_native(
-    filter_str,
     NATIVE_FP_ATOMICS,
     expected_native_atomic_for_device,
     function_generator,
@@ -206,20 +204,25 @@ def test_atomic_fp_native(
     dtype,
 ):
     function = function_generator(operator_name, dtype)
-    kernel = dpex.kernel(function)
-    argtypes = kernel._get_argtypes(np.array([0], dtype))
+    kernel = dpex.core.kernel_interface.spirv_kernel.SpirvKernel(
+        function, function.__name__
+    )
+    args = [np.array([0], dtype)]
+    argtypes = [
+        dpex.core.descriptor.dpex_target.typing_context.resolve_argument_type(
+            arg
+        )
+        for arg in args
+    ]
 
     with override_config("NATIVE_FP_ATOMICS", NATIVE_FP_ATOMICS):
-
-        with dpctl.device_context(filter_str) as sycl_queue:
-
-            specialized_kernel = kernel[
-                global_size, dpex.DEFAULT_LOCAL_SIZE
-            ].specialize(argtypes, sycl_queue)
-
-            is_native_atomic = (
-                expected_spirv_function in specialized_kernel.assembly
-            )
-            assert is_native_atomic == expected_native_atomic_for_device(
-                filter_str
-            )
+        kernel.compile(
+            arg_types=argtypes,
+            debug=None,
+            extra_compile_flags=None,
+        )
+
+        is_native_atomic = expected_spirv_function in kernel._llvm_module
+        assert is_native_atomic == expected_native_atomic_for_device(
+            dpctl.select_default_device().filter_string
+        )

From 782069e0e2908bb7f77d814a489e4d92f4beaffa Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 8 Nov 2022 20:01:21 -0600
Subject: [PATCH 14/51] Update tests to check DI tag generation.

---
 numba_dpex/tests/test_debuginfo.py | 48 ++++++++++--------------------
 1 file changed, 16 insertions(+), 32 deletions(-)

diff --git a/numba_dpex/tests/test_debuginfo.py b/numba_dpex/tests/test_debuginfo.py
index 3c9a92f655..7fc5cc1e64 100644
--- a/numba_dpex/tests/test_debuginfo.py
+++ b/numba_dpex/tests/test_debuginfo.py
@@ -23,11 +23,16 @@ def debug_option(request):
     return request.param
 
 
-def get_kernel_ir(sycl_queue, fn, sig, debug=None):
-    kernel = compiler.compile_kernel(
-        sycl_queue, fn.py_func, sig, None, debug=debug
+def get_kernel_ir(fn, sig, debug=None):
+    kernel = dpex.core.kernel_interface.spirv_kernel.SpirvKernel(
+        fn, fn.__name__
     )
-    return kernel.assembly
+    kernel.compile(
+        arg_types=sig,
+        debug=debug,
+        extra_compile_flags=None,
+    )
+    return kernel.llvm_module
 
 
 def make_check(ir, val_to_search):
@@ -45,15 +50,11 @@ def test_debug_flag_generates_ir_with_debuginfo(debug_option):
     Check debug info is emitting to IR if debug parameter is set to True
     """
 
-    @dpex.kernel
     def foo(x):
         x = 1  # noqa
 
-    sycl_queue = dpctl.get_current_queue()
     sig = (types.int32,)
-
-    kernel_ir = get_kernel_ir(sycl_queue, foo, sig, debug=debug_option)
-
+    kernel_ir = get_kernel_ir(foo, sig, debug=debug_option)
     tag = "!dbg"
 
     if debug_option:
@@ -68,7 +69,6 @@ def test_debug_info_locals_vars_on_no_opt():
     if debug parameter is set to True and optimization is O0
     """
 
-    @dpex.kernel
     def foo(var_a, var_b, var_c):
         i = dpex.get_global_id(0)
         var_c[i] = var_a[i] + var_b[i]
@@ -79,8 +79,6 @@ def foo(var_a, var_b, var_c):
         '!DILocalVariable(name: "var_c"',
         '!DILocalVariable(name: "i"',
     ]
-
-    sycl_queue = dpctl.get_current_queue()
     sig = (
         npytypes_array_to_dpex_array(types.float32[:]),
         npytypes_array_to_dpex_array(types.float32[:]),
@@ -88,7 +86,7 @@ def foo(var_a, var_b, var_c):
     )
 
     with override_config("OPT", 0):
-        kernel_ir = get_kernel_ir(sycl_queue, foo, sig, debug=True)
+        kernel_ir = get_kernel_ir(foo, sig, debug=True)
 
     for tag in ir_tags:
         assert tag in kernel_ir
@@ -100,7 +98,6 @@ def test_debug_kernel_local_vars_in_ir():
     created in kernel
     """
 
-    @dpex.kernel
     def foo(arr):
         index = dpex.get_global_id(0)
         local_d = 9 * 99 + 5
@@ -110,11 +107,8 @@ def foo(arr):
         '!DILocalVariable(name: "index"',
         '!DILocalVariable(name: "local_d"',
     ]
-
-    sycl_queue = dpctl.get_current_queue()
     sig = (npytypes_array_to_dpex_array(types.float32[:]),)
-
-    kernel_ir = get_kernel_ir(sycl_queue, foo, sig, debug=True)
+    kernel_ir = get_kernel_ir(foo, sig, debug=True)
 
     for tag in ir_tags:
         assert tag in kernel_ir
@@ -130,7 +124,6 @@ def func_sum(a, b):
         result = a + b
         return result
 
-    @dpex.kernel(debug=debug_option)
     def data_parallel_sum(a, b, c):
         i = dpex.get_global_id(0)
         c[i] = func_sum(a[i], b[i])
@@ -140,16 +133,13 @@ def data_parallel_sum(a, b, c):
         r'\!DISubprogram\(name: ".*data_parallel_sum"',
     ]
 
-    sycl_queue = dpctl.get_current_queue()
     sig = (
         npytypes_array_to_dpex_array(types.float32[:]),
         npytypes_array_to_dpex_array(types.float32[:]),
         npytypes_array_to_dpex_array(types.float32[:]),
     )
 
-    kernel_ir = get_kernel_ir(
-        sycl_queue, data_parallel_sum, sig, debug=debug_option
-    )
+    kernel_ir = get_kernel_ir(data_parallel_sum, sig, debug=debug_option)
 
     for tag in ir_tags:
         assert debug_option == make_check(kernel_ir, tag)
@@ -165,7 +155,6 @@ def func_sum(a, b):
         result = a + b
         return result
 
-    @dpex.kernel
     def data_parallel_sum(a, b, c):
         i = dpex.get_global_id(0)
         c[i] = func_sum(a[i], b[i])
@@ -175,7 +164,6 @@ def data_parallel_sum(a, b, c):
         r'\!DISubprogram\(name: ".*data_parallel_sum"',
     ]
 
-    sycl_queue = dpctl.get_current_queue()
     sig = (
         npytypes_array_to_dpex_array(types.float32[:]),
         npytypes_array_to_dpex_array(types.float32[:]),
@@ -183,14 +171,13 @@ def data_parallel_sum(a, b, c):
     )
 
     with override_config("DEBUGINFO_DEFAULT", int(debug_option)):
-        kernel_ir = get_kernel_ir(sycl_queue, data_parallel_sum, sig)
+        kernel_ir = get_kernel_ir(data_parallel_sum, sig)
 
     for tag in ir_tags:
         assert debug_option == make_check(kernel_ir, tag)
 
 
 def test_debuginfo_DISubprogram_linkageName():
-    @dpex.kernel
     def func(a, b):
         i = dpex.get_global_id(0)
         b[i] = a[i]
@@ -199,20 +186,18 @@ def func(a, b):
         r'\!DISubprogram\(.*linkageName: ".*e4func.*"',
     ]
 
-    sycl_queue = dpctl.get_current_queue()
     sig = (
         npytypes_array_to_dpex_array(types.float32[:]),
         npytypes_array_to_dpex_array(types.float32[:]),
     )
 
-    kernel_ir = get_kernel_ir(sycl_queue, func, sig, debug=True)
+    kernel_ir = get_kernel_ir(func, sig, debug=True)
 
     for tag in ir_tags:
         assert make_check(kernel_ir, tag)
 
 
 def test_debuginfo_DICompileUnit_language_and_producer():
-    @dpex.kernel
     def func(a, b):
         i = dpex.get_global_id(0)
         b[i] = a[i]
@@ -222,13 +207,12 @@ def func(a, b):
         r'\!DICompileUnit\(.*producer: "numba-dpex"',
     ]
 
-    sycl_queue = dpctl.get_current_queue()
     sig = (
         npytypes_array_to_dpex_array(types.float32[:]),
         npytypes_array_to_dpex_array(types.float32[:]),
     )
 
-    kernel_ir = get_kernel_ir(sycl_queue, func, sig, debug=True)
+    kernel_ir = get_kernel_ir(func, sig, debug=True)
 
     for tag in ir_tags:
         assert make_check(kernel_ir, tag)

From bce61d2ee2a3e87a3efda8f565cda8147ef51f61 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 8 Nov 2022 21:15:53 -0600
Subject: [PATCH 15/51] Fix to address failing unit test for strided numpy
 array support.

---
 numba_dpex/core/kernel_interface/arg_pack_unpacker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
index 088c7c3c9a..0c63dfa1e5 100644
--- a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
+++ b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
@@ -248,7 +248,7 @@ def _pack_array(self):
         for obj in self._repack_list:
             utils.copy_to_numpy_from_usm_obj(obj._usm_mem, obj._packed_val)
             if obj._packed:
-                np.copyto(obj.orig_val, obj._packed_val)
+                np.copyto(obj._orig_val, obj._packed_val)
 
     def __init__(
         self, kernel_name, arg_list, argty_list, access_specifiers_list, queue

From c19353304552be11fda83bd330fcb554efcfa414 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Sat, 19 Nov 2022 21:18:00 -0600
Subject: [PATCH 16/51] move pass pipelines into compiler.py

---
 numba_dpex/core/compiler.py   | 210 ++++++++++++++++++++++++++++++++--
 numba_dpex/core/exceptions.py |  12 +-
 2 files changed, 210 insertions(+), 12 deletions(-)

diff --git a/numba_dpex/core/compiler.py b/numba_dpex/core/compiler.py
index bd5978d242..05cf64dbc2 100644
--- a/numba_dpex/core/compiler.py
+++ b/numba_dpex/core/compiler.py
@@ -2,27 +2,217 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from numba.core.compiler import CompilerBase, DefaultPassBuilder
+from numba.core.compiler import CompilerBase
+from numba.core.compiler_machinery import PassManager
+from numba.core.typed_passes import (
+    AnnotateTypes,
+    InlineOverloads,
+    IRLegalization,
+    NopythonRewrites,
+    NoPythonSupportedFeatureValidation,
+    NopythonTypeInference,
+    PreLowerStripPhis,
+)
+from numba.core.untyped_passes import (
+    DeadBranchPrune,
+    FindLiterallyCalls,
+    FixupArgs,
+    GenericRewrites,
+    InlineClosureLikes,
+    InlineInlinables,
+    IRProcessing,
+    LiteralPropagationSubPipelinePass,
+    LiteralUnroll,
+    MakeFunctionToJitFunction,
+    ReconstructSSA,
+    RewriteSemanticConstants,
+    TranslateByteCode,
+    WithLifting,
+)
 
+from numba_dpex.core.exceptions import UnsupportedCompilationModeError
+from numba_dpex.core.passes.passes import (
+    ConstantSizeStaticLocalMemoryPass,
+    DpexLowering,
+    DumpParforDiagnostics,
+    NoPythonBackend,
+    ParforPass,
+    PreParforPass,
+)
+from numba_dpex.core.passes.rename_numpy_functions_pass import (
+    RewriteNdarrayFunctionsPass,
+    RewriteOverloadedNumPyFunctionsPass,
+)
 from numba_dpex.parfor_diagnostics import ExtendedParforDiagnostics
 
-from .passbuilder import PassBuilder
+
+class PassBuilder(object):
+    """
+    A pass builder to run dpex's code-generation and optimization passes.
+
+    Unlike Numba, dpex's pass builder does not offer objectmode and
+    interpreted passes.
+    """
+
+    @staticmethod
+    def define_untyped_pipeline(state, name="dpex_untyped"):
+        """Returns an untyped part of the nopython pipeline
+
+        The pipeline of untyped passes is duplicated from Numba's compiler. We
+        are adding couple of passes to the pipeline to change specific numpy
+        overloads.
+        """
+        pm = PassManager(name)
+        if state.func_ir is None:
+            pm.add_pass(TranslateByteCode, "analyzing bytecode")
+            pm.add_pass(FixupArgs, "fix up args")
+        pm.add_pass(IRProcessing, "processing IR")
+        pm.add_pass(WithLifting, "Handle with contexts")
+
+        # --- Begin dpex passes added to the untyped pipeline                --#
+
+        # The RewriteOverloadedNumPyFunctionsPass rewrites the module namespace
+        # of specific NumPy functions to dpnp, as we overload these functions
+        # differently.
+        pm.add_pass(
+            RewriteOverloadedNumPyFunctionsPass,
+            "Rewrite name of Numpy functions to overload already overloaded "
+            + "function",
+        )
+        # Add pass to ensure when users allocate static constant memory the
+        # size of the allocation is a constant and not specified by a closure
+        # variable.
+        pm.add_pass(
+            ConstantSizeStaticLocalMemoryPass,
+            "dpex constant size for static local memory",
+        )
+
+        # --- End of dpex passes added to the untyped pipeline               --#
+
+        # inline closures early in case they are using nonlocal's
+        # see issue #6585.
+        pm.add_pass(
+            InlineClosureLikes, "inline calls to locally defined closures"
+        )
+
+        # pre typing
+        if not state.flags.no_rewrites:
+            pm.add_pass(RewriteSemanticConstants, "rewrite semantic constants")
+            pm.add_pass(DeadBranchPrune, "dead branch pruning")
+            pm.add_pass(GenericRewrites, "nopython rewrites")
+
+        # convert any remaining closures into functions
+        pm.add_pass(
+            MakeFunctionToJitFunction,
+            "convert make_function into JIT functions",
+        )
+        # inline functions that have been determined as inlinable and rerun
+        # branch pruning, this needs to be run after closures are inlined as
+        # the IR repr of a closure masks call sites if an inlinable is called
+        # inside a closure
+        pm.add_pass(InlineInlinables, "inline inlinable functions")
+        if not state.flags.no_rewrites:
+            pm.add_pass(DeadBranchPrune, "dead branch pruning")
+
+        pm.add_pass(FindLiterallyCalls, "find literally calls")
+        pm.add_pass(LiteralUnroll, "handles literal_unroll")
+
+        if state.flags.enable_ssa:
+            pm.add_pass(ReconstructSSA, "ssa")
+
+        pm.add_pass(LiteralPropagationSubPipelinePass, "Literal propagation")
+
+        pm.finalize()
+        return pm
+
+    @staticmethod
+    def define_typed_pipeline(state, name="dpex_typed"):
+        """Returns the typed part of the nopython pipeline"""
+        pm = PassManager(name)
+        # typing
+        pm.add_pass(NopythonTypeInference, "nopython frontend")
+
+        pm.add_pass(
+            RewriteNdarrayFunctionsPass,
+            "Rewrite numpy.ndarray functions to dpnp.ndarray functions",
+        )
+
+        # strip phis
+        pm.add_pass(PreLowerStripPhis, "remove phis nodes")
+
+        # optimization
+        pm.add_pass(InlineOverloads, "inline overloaded functions")
+        pm.add_pass(PreParforPass, "Preprocessing for parfors")
+        if not state.flags.no_rewrites:
+            pm.add_pass(NopythonRewrites, "nopython rewrites")
+        pm.add_pass(ParforPass, "convert to parfors")
+
+        pm.finalize()
+        return pm
+
+    @staticmethod
+    def define_nopython_lowering_pipeline(state, name="dpex_nopython_lowering"):
+        """Returns an nopython mode pipeline based PassManager"""
+        pm = PassManager(name)
+
+        # legalize
+        pm.add_pass(
+            NoPythonSupportedFeatureValidation,
+            "ensure features that are in use are in a valid form",
+        )
+        pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering")
+
+        # Annotate only once legalized
+        pm.add_pass(AnnotateTypes, "annotate types")
+
+        # lower
+        pm.add_pass(DpexLowering, "Custom Lowerer with auto-offload support")
+        pm.add_pass(NoPythonBackend, "nopython mode backend")
+        pm.add_pass(DumpParforDiagnostics, "dump parfor diagnostics")
+
+        pm.finalize()
+        return pm
+
+    @staticmethod
+    def define_nopython_pipeline(state, name="dpex_nopython"):
+        """Returns an nopython mode pipeline based PassManager"""
+        # compose pipeline from untyped, typed and lowering parts
+        dpb = PassBuilder
+        pm = PassManager(name)
+        untyped_passes = dpb.define_untyped_pipeline(state)
+        pm.passes.extend(untyped_passes.passes)
+
+        typed_passes = dpb.define_typed_pipeline(state)
+        pm.passes.extend(typed_passes.passes)
+
+        lowering_passes = dpb.define_nopython_lowering_pipeline(state)
+        pm.passes.extend(lowering_passes.passes)
+
+        pm.finalize()
+        return pm
 
 
 class Compiler(CompilerBase):
-    """The DPEX compiler pipeline."""
+    """Dpex's compiler pipeline."""
 
     def define_pipelines(self):
-        # this maintains the objmode fallback behaviour
-        pms = []
+        dpb = PassBuilder
+        pm = PassManager("dpex")
+
         self.state.parfor_diagnostics = ExtendedParforDiagnostics()
         self.state.metadata[
             "parfor_diagnostics"
         ] = self.state.parfor_diagnostics
+
+        passes = dpb.define_nopython_pipeline(self.state)
+        pm.passes.extend(passes.passes)
+
         if not self.state.flags.force_pyobject:
-            pms.append(PassBuilder.define_nopython_pipeline(self.state))
+            pm.extend(PassBuilder.define_nopython_pipeline(self.state))
+
         if self.state.status.can_fallback or self.state.flags.force_pyobject:
-            pms.append(
-                DefaultPassBuilder.define_objectmode_pipeline(self.state)
-            )
-        return pms
+            raise UnsupportedCompilationModeError()
+
+        pm.finalize()
+
+        return [pm]
diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py
index a552c6df1f..11098aae33 100644
--- a/numba_dpex/core/exceptions.py
+++ b/numba_dpex/core/exceptions.py
@@ -295,8 +295,8 @@ def __init__(self, kernel_name, arg) -> None:
 
 
 class UnsupportedAccessQualifierError(Exception):
-    """Exception raised when an illegal access specifier value is specified for an
-    NumPy array argument passed to a kernel.
+    """Exception raised when an illegal access specifier value is specified for
+    a NumPy array argument passed to a kernel.
 
     Args:
         kernel_name (str): Name of kernel where the error was raised.
@@ -313,3 +313,11 @@ def __init__(
         f"Legal access specifiers are {legal_access_list}."
 
         super().__init__(self.message)
+
+
+class UnsupportedCompilationModeError(Exception):
+    def __init__(self) -> None:
+        self.message = (
+            'The dpex compiler does not support the "force_pyobject" setting.'
+        )
+        super().__init__(self.message)

From fb677dbdd609922e9877a698d0c09f604b03b0f7 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Sat, 19 Nov 2022 21:33:36 -0600
Subject: [PATCH 17/51] Incorporate review comments.

---
 .../core/kernel_interface/spirv_kernel.py     | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/numba_dpex/core/kernel_interface/spirv_kernel.py b/numba_dpex/core/kernel_interface/spirv_kernel.py
index 3acdc52ece..091d098480 100644
--- a/numba_dpex/core/kernel_interface/spirv_kernel.py
+++ b/numba_dpex/core/kernel_interface/spirv_kernel.py
@@ -22,11 +22,22 @@
 
 
 class SpirvKernel(KernelInterface):
-    def __init__(self, func, pyfunc_name) -> None:
+    def __init__(self, func, func_name) -> None:
+        """Represents a SPIR-V module compiled for a Python function.
+
+        Args:
+            func: The function to be compiled. Can be a Python function or a
+            Numba IR object representing a function.
+            func_name (str): Name of the function being compiled
+
+        Raises:
+            UnreachableError: An internal error indicating an unexpected code
+            path was executed.
+        """
         self._llvm_module = None
         self._device_driver_ir_module = None
         self._module_name = None
-        self._pyfunc_name = pyfunc_name
+        self._pyfunc_name = func_name
         self._func = func
         if isinstance(func, FunctionType):
             self._func_ty = FunctionType
@@ -36,14 +47,12 @@ def __init__(self, func, pyfunc_name) -> None:
             raise UnreachableError()
 
     @global_compiler_lock
-    def _compile(self, pyfunc, args, debug=None, extra_compile_flags=None):
+    def _compile(self, args, debug=None, extra_compile_flags=None):
         """
         Compiles the function using the dpex compiler pipeline and returns the
         compiled result.
 
         Args:
-            pyfunc: The function to be compiled. Can be a Python function or a
-            Numba IR object representing a function.
             args: The list of arguments passed to the kernel.
             debug (bool): Optional flag to turn on debug mode compilation.
             extra_compile_flags: Extra flags passed to the compiler.
@@ -70,22 +79,22 @@ def _compile(self, pyfunc, args, debug=None, extra_compile_flags=None):
             flags.debuginfo = debug
 
         # Run compilation pipeline
-        if isinstance(pyfunc, FunctionType):
+        if isinstance(self._func, FunctionType):
             cres = compiler.compile_extra(
                 typingctx=typingctx,
                 targetctx=targetctx,
-                func=pyfunc,
+                func=self._func,
                 args=args,
                 return_type=None,
                 flags=flags,
                 locals={},
                 pipeline_class=dpex_compiler.Compiler,
             )
-        elif isinstance(pyfunc, ir.FunctionIR):
+        elif isinstance(self._func, ir.FunctionIR):
             cres = compiler.compile_ir(
                 typingctx=typingctx,
                 targetctx=targetctx,
-                func_ir=pyfunc,
+                func_ir=self._func,
                 args=args,
                 return_type=None,
                 flags=flags,
@@ -100,7 +109,7 @@ def _compile(self, pyfunc, args, debug=None, extra_compile_flags=None):
             and cres.signature.return_type != numba_types.void
         ):
             raise KernelHasReturnValueError(
-                kernel_name=pyfunc.__name__,
+                kernel_name=self._pyfunc_name,
                 return_type=cres.signature.return_type,
             )
         # Linking depending libraries
@@ -150,7 +159,6 @@ def compile(self, arg_types, debug, extra_compile_flags):
         logging.debug("compiling SpirvKernel with arg types", arg_types)
 
         cres = self._compile(
-            pyfunc=self._func,
             args=arg_types,
             debug=debug,
             extra_compile_flags=extra_compile_flags,

From 43bb3da578ef8266ab21063febf79b2e004a0cd7 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Mon, 21 Nov 2022 17:22:42 -0600
Subject: [PATCH 18/51] Port func decorator to new API.

---
 numba_dpex/core/_compile_helper.py            |  99 +++++++++++++++++
 numba_dpex/core/compiler.py                   |  23 ++--
 numba_dpex/core/kernel_interface/func.py      | 102 ++++++++++++++++++
 .../core/kernel_interface/spirv_kernel.py     |  95 ++--------------
 numba_dpex/decorators.py                      |  18 ++--
 5 files changed, 227 insertions(+), 110 deletions(-)
 create mode 100644 numba_dpex/core/_compile_helper.py
 create mode 100644 numba_dpex/core/kernel_interface/func.py

diff --git a/numba_dpex/core/_compile_helper.py b/numba_dpex/core/_compile_helper.py
new file mode 100644
index 0000000000..d0aa0b30ee
--- /dev/null
+++ b/numba_dpex/core/_compile_helper.py
@@ -0,0 +1,99 @@
+# SPDX-FileCopyrightText: 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from types import FunctionType
+
+from numba.core import compiler, ir
+from numba.core import types as numba_types
+from numba.core.compiler_lock import global_compiler_lock
+
+from numba_dpex import config
+from numba_dpex.core import compiler as dpex_compiler
+from numba_dpex.core.descriptor import dpex_target
+from numba_dpex.core.exceptions import (
+    KernelHasReturnValueError,
+    UnreachableError,
+)
+
+
+@global_compiler_lock
+def compile_with_dpex(
+    pyfunc,
+    pyfunc_name,
+    args,
+    return_type,
+    debug=None,
+    is_kernel=True,
+    extra_compile_flags=None,
+):
+    """
+    Compiles the function using the dpex compiler pipeline and returns the
+    compiled result.
+
+    Args:
+        args: The list of arguments passed to the kernel.
+        debug (bool): Optional flag to turn on debug mode compilation.
+        extra_compile_flags: Extra flags passed to the compiler.
+
+    Returns:
+        cres: Compiled result.
+
+    Raises:
+        KernelHasReturnValueError: If the compiled function returns a
+        non-void value.
+    """
+    # First compilation will trigger the initialization of the backend.
+    typingctx = dpex_target.typing_context
+    targetctx = dpex_target.target_context
+
+    flags = compiler.Flags()
+    # Do not compile the function to a binary, just lower to LLVM
+    flags.debuginfo = config.DEBUGINFO_DEFAULT
+    flags.no_compile = True
+    flags.no_cpython_wrapper = True
+    flags.nrt = False
+
+    if debug is not None:
+        flags.debuginfo = debug
+
+    # Run compilation pipeline
+    if isinstance(pyfunc, FunctionType):
+        cres = compiler.compile_extra(
+            typingctx=typingctx,
+            targetctx=targetctx,
+            func=pyfunc,
+            args=args,
+            return_type=return_type,
+            flags=flags,
+            locals={},
+            pipeline_class=dpex_compiler.Compiler,
+        )
+    elif isinstance(pyfunc, ir.FunctionIR):
+        cres = compiler.compile_ir(
+            typingctx=typingctx,
+            targetctx=targetctx,
+            func_ir=pyfunc,
+            args=args,
+            return_type=return_type,
+            flags=flags,
+            locals={},
+            pipeline_class=dpex_compiler.Compiler,
+        )
+    else:
+        raise UnreachableError()
+
+    if (
+        is_kernel
+        and cres.signature.return_type is not None
+        and cres.signature.return_type != numba_types.void
+    ):
+        raise KernelHasReturnValueError(
+            kernel_name=pyfunc_name,
+            return_type=cres.signature.return_type,
+        )
+    # Linking depending libraries
+    library = cres.library
+    library.finalize()
+
+    return cres
diff --git a/numba_dpex/core/compiler.py b/numba_dpex/core/compiler.py
index 05cf64dbc2..df5d87aef4 100644
--- a/numba_dpex/core/compiler.py
+++ b/numba_dpex/core/compiler.py
@@ -131,7 +131,8 @@ def define_typed_pipeline(state, name="dpex_typed"):
         pm = PassManager(name)
         # typing
         pm.add_pass(NopythonTypeInference, "nopython frontend")
-
+        # Annotate only once legalized
+        pm.add_pass(AnnotateTypes, "annotate types")
         pm.add_pass(
             RewriteNdarrayFunctionsPass,
             "Rewrite numpy.ndarray functions to dpnp.ndarray functions",
@@ -162,9 +163,6 @@ def define_nopython_lowering_pipeline(state, name="dpex_nopython_lowering"):
         )
         pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering")
 
-        # Annotate only once legalized
-        pm.add_pass(AnnotateTypes, "annotate types")
-
         # lower
         pm.add_pass(DpexLowering, "Custom Lowerer with auto-offload support")
         pm.add_pass(NoPythonBackend, "nopython mode backend")
@@ -196,23 +194,14 @@ class Compiler(CompilerBase):
     """Dpex's compiler pipeline."""
 
     def define_pipelines(self):
-        dpb = PassBuilder
-        pm = PassManager("dpex")
-
+        # this maintains the objmode fallback behaviour
+        pms = []
         self.state.parfor_diagnostics = ExtendedParforDiagnostics()
         self.state.metadata[
             "parfor_diagnostics"
         ] = self.state.parfor_diagnostics
-
-        passes = dpb.define_nopython_pipeline(self.state)
-        pm.passes.extend(passes.passes)
-
         if not self.state.flags.force_pyobject:
-            pm.extend(PassBuilder.define_nopython_pipeline(self.state))
-
+            pms.append(PassBuilder.define_nopython_pipeline(self.state))
         if self.state.status.can_fallback or self.state.flags.force_pyobject:
             raise UnsupportedCompilationModeError()
-
-        pm.finalize()
-
-        return [pm]
+        return pms
diff --git a/numba_dpex/core/kernel_interface/func.py b/numba_dpex/core/kernel_interface/func.py
new file mode 100644
index 0000000000..c0c24789fe
--- /dev/null
+++ b/numba_dpex/core/kernel_interface/func.py
@@ -0,0 +1,102 @@
+# SPDX-FileCopyrightText: 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""_summary_
+"""
+
+
+from numba.core.typing.templates import AbstractTemplate, ConcreteTemplate
+
+from numba_dpex.core._compile_helper import compile_with_dpex
+
+
+def compile_func(pyfunc, return_type, args, debug=None):
+    cres = compile_with_dpex(
+        pyfunc=pyfunc,
+        pyfunc_name=pyfunc.__name__,
+        return_type=return_type,
+        args=args,
+        is_kernel=False,
+        debug=debug,
+    )
+    func = cres.library.get_function(cres.fndesc.llvm_func_name)
+    cres.target_context.mark_ocl_device(func)
+    devfn = DpexFunction(cres)
+
+    class _function_template(ConcreteTemplate):
+        key = devfn
+        cases = [cres.signature]
+
+    cres.typing_context.insert_user_function(devfn, _function_template)
+    libs = [cres.library]
+    cres.target_context.insert_user_function(devfn, cres.fndesc, libs)
+    return devfn
+
+
+def compile_func_template(pyfunc, debug=None):
+    """Compile a DpexFunctionTemplate"""
+    from numba_dpex.core.descriptor import dpex_target
+
+    dft = DpexFunctionTemplate(pyfunc, debug=debug)
+
+    class _function_template(AbstractTemplate):
+        key = dft
+
+        def generic(self, args, kws):
+            if kws:
+                raise AssertionError("No keyword arguments allowed.")
+            return dft.compile(args)
+
+    typingctx = dpex_target.typing_context
+    typingctx.insert_user_function(dft, _function_template)
+    return dft
+
+
+class DpexFunctionTemplate(object):
+    """Unmaterialized dpex function"""
+
+    def __init__(self, pyfunc, debug=None):
+        self.py_func = pyfunc
+        self.debug = debug
+        self._compileinfos = {}
+
+    def compile(self, args):
+        """Compile a dpex.func decorated Python function with the given
+        argument types.
+
+        Each signature is compiled once by caching the compiled function inside
+        this object.
+        """
+        if args not in self._compileinfos:
+            cres = compile_with_dpex(
+                pyfunc=self.py_func,
+                pyfunc_name=self.py_func.__name__,
+                return_type=None,
+                args=args,
+                is_kernel=False,
+                debug=self.debug,
+            )
+            func = cres.library.get_function(cres.fndesc.llvm_func_name)
+            cres.target_context.mark_ocl_device(func)
+            first_definition = not self._compileinfos
+            self._compileinfos[args] = cres
+            libs = [cres.library]
+
+            if first_definition:
+                # First definition
+                cres.target_context.insert_user_function(
+                    self, cres.fndesc, libs
+                )
+            else:
+                cres.target_context.add_user_function(self, cres.fndesc, libs)
+
+        else:
+            cres = self._compileinfos[args]
+
+        return cres.signature
+
+
+class DpexFunction(object):
+    def __init__(self, cres):
+        self.cres = cres
diff --git a/numba_dpex/core/kernel_interface/spirv_kernel.py b/numba_dpex/core/kernel_interface/spirv_kernel.py
index 091d098480..efbf438da4 100644
--- a/numba_dpex/core/kernel_interface/spirv_kernel.py
+++ b/numba_dpex/core/kernel_interface/spirv_kernel.py
@@ -5,18 +5,11 @@
 import logging
 from types import FunctionType
 
-from numba.core import compiler, ir
-from numba.core import types as numba_types
-from numba.core.compiler_lock import global_compiler_lock
-
-from numba_dpex import compiler as dpex_compiler
-from numba_dpex import config, spirv_generator
-from numba_dpex.core.descriptor import dpex_target
-from numba_dpex.core.exceptions import (
-    KernelHasReturnValueError,
-    UncompiledKernelError,
-    UnreachableError,
-)
+from numba.core import ir
+
+from numba_dpex import spirv_generator
+from numba_dpex.core import _compile_helper
+from numba_dpex.core.exceptions import UncompiledKernelError, UnreachableError
 
 from .kernel_base import KernelInterface
 
@@ -46,78 +39,6 @@ def __init__(self, func, func_name) -> None:
         else:
             raise UnreachableError()
 
-    @global_compiler_lock
-    def _compile(self, args, debug=None, extra_compile_flags=None):
-        """
-        Compiles the function using the dpex compiler pipeline and returns the
-        compiled result.
-
-        Args:
-            args: The list of arguments passed to the kernel.
-            debug (bool): Optional flag to turn on debug mode compilation.
-            extra_compile_flags: Extra flags passed to the compiler.
-
-        Returns:
-            cres: Compiled result.
-
-        Raises:
-            KernelHasReturnValueError: If the compiled function returns a
-            non-void value.
-        """
-        # First compilation will trigger the initialization of the backend.
-        typingctx = dpex_target.typing_context
-        targetctx = dpex_target.target_context
-
-        flags = compiler.Flags()
-        # Do not compile the function to a binary, just lower to LLVM
-        flags.debuginfo = config.DEBUGINFO_DEFAULT
-        flags.no_compile = True
-        flags.no_cpython_wrapper = True
-        flags.nrt = False
-
-        if debug is not None:
-            flags.debuginfo = debug
-
-        # Run compilation pipeline
-        if isinstance(self._func, FunctionType):
-            cres = compiler.compile_extra(
-                typingctx=typingctx,
-                targetctx=targetctx,
-                func=self._func,
-                args=args,
-                return_type=None,
-                flags=flags,
-                locals={},
-                pipeline_class=dpex_compiler.Compiler,
-            )
-        elif isinstance(self._func, ir.FunctionIR):
-            cres = compiler.compile_ir(
-                typingctx=typingctx,
-                targetctx=targetctx,
-                func_ir=self._func,
-                args=args,
-                return_type=None,
-                flags=flags,
-                locals={},
-                pipeline_class=dpex_compiler.Compiler,
-            )
-        else:
-            raise UnreachableError()
-
-        if (
-            cres.signature.return_type is not None
-            and cres.signature.return_type != numba_types.void
-        ):
-            raise KernelHasReturnValueError(
-                kernel_name=self._pyfunc_name,
-                return_type=cres.signature.return_type,
-            )
-        # Linking depending libraries
-        library = cres.library
-        library.finalize()
-
-        return cres
-
     @property
     def llvm_module(self):
         """The LLVM IR Module corresponding to the Kernel instance."""
@@ -158,9 +79,13 @@ def compile(self, arg_types, debug, extra_compile_flags):
 
         logging.debug("compiling SpirvKernel with arg types", arg_types)
 
-        cres = self._compile(
+        cres = _compile_helper.compile_with_dpex(
+            self._func,
+            self._pyfunc_name,
             args=arg_types,
+            return_type=None,
             debug=debug,
+            is_kernel=True,
             extra_compile_flags=extra_compile_flags,
         )
 
diff --git a/numba_dpex/decorators.py b/numba_dpex/decorators.py
index b936fed90c..be73352009 100644
--- a/numba_dpex/decorators.py
+++ b/numba_dpex/decorators.py
@@ -5,17 +5,19 @@
 import dpctl
 from numba.core import sigutils, types
 
-from numba_dpex.core.exceptions import KernelHasReturnValueError
+from numba_dpex.compiler import JitKernel
 from numba_dpex.core.kernel_interface.dispatcher import (
     Dispatcher,
     get_ordered_arg_access_types,
 )
+from numba_dpex.core.kernel_interface.func import (
+    compile_func,
+    compile_func_template,
+)
 from numba_dpex.utils import npytypes_array_to_dpex_array
 
-from .compiler import JitKernel, compile_func, compile_func_template
-
 
-def kernel(signature=None, access_types=None, debug=None):
+def kernel(func_or_sig=None, access_types=None, debug=None):
     """The decorator to write a numba_dpex kernel function.
 
     A kernel function is conceptually equivalent to a SYCL kernel function, and
@@ -26,13 +28,13 @@ def kernel(signature=None, access_types=None, debug=None):
         * All array arguments passed to a kernel should be of the same type
           and have the same dtype.
     """
-    if signature is None:
+    if func_or_sig is None:
         return autojit(debug=debug, access_types=access_types)
-    elif not sigutils.is_signature(signature):
-        func = signature
+    elif not sigutils.is_signature(func_or_sig):
+        func = func_or_sig
         return autojit(debug=debug, access_types=access_types)(func)
     else:
-        return _kernel_jit(signature, debug, access_types)
+        return _kernel_jit(func_or_sig, debug, access_types)
 
 
 def autojit(debug=None, access_types=None):

From e447f100887d57863d124e57e5b725ea1daceefe Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 6 Dec 2022 23:22:34 -0600
Subject: [PATCH 19/51] Migrate functionality in numba_dpex.compiler module to
 new compiler.

---
 numba_dpex/compiler.py            | 111 +----------------------
 numba_dpex/core/passbuilder.py    | 145 ------------------------------
 numba_dpex/core/passes/lowerer.py |   9 +-
 numba_dpex/offload_dispatcher.py  |   2 +-
 4 files changed, 7 insertions(+), 260 deletions(-)
 delete mode 100644 numba_dpex/core/passbuilder.py

diff --git a/numba_dpex/compiler.py b/numba_dpex/compiler.py
index 1e69df240f..4c01f00c11 100644
--- a/numba_dpex/compiler.py
+++ b/numba_dpex/compiler.py
@@ -13,15 +13,13 @@
 import dpctl.utils
 import numpy as np
 from numba.core import compiler, ir, types
-from numba.core.compiler import CompilerBase, DefaultPassBuilder
 from numba.core.compiler_lock import global_compiler_lock
-from numba.core.typing.templates import AbstractTemplate, ConcreteTemplate
 
 from numba_dpex import config
+from numba_dpex.core.compiler import Compiler
 from numba_dpex.core.exceptions import KernelHasReturnValueError
 from numba_dpex.core.types import Array, USMNdArray
 from numba_dpex.dpctl_support import dpctl_version
-from numba_dpex.parfor_diagnostics import ExtendedParforDiagnostics
 from numba_dpex.utils import (
     IndeterminateExecutionQueueError,
     as_usm_obj,
@@ -62,25 +60,6 @@ def _raise_invalid_kernel_enqueue_args():
     raise ValueError(error_message)
 
 
-class Compiler(CompilerBase):
-    """The DPEX compiler pipeline."""
-
-    def define_pipelines(self):
-        # this maintains the objmode fallback behaviour
-        pms = []
-        self.state.parfor_diagnostics = ExtendedParforDiagnostics()
-        self.state.metadata[
-            "parfor_diagnostics"
-        ] = self.state.parfor_diagnostics
-        if not self.state.flags.force_pyobject:
-            pms.append(PassBuilder.define_nopython_pipeline(self.state))
-        if self.state.status.can_fallback or self.state.flags.force_pyobject:
-            pms.append(
-                DefaultPassBuilder.define_objectmode_pipeline(self.state)
-            )
-        return pms
-
-
 @global_compiler_lock
 def compile_with_depx(pyfunc, return_type, args, is_kernel, debug=None):
     """
@@ -244,94 +223,6 @@ def compile_kernel_parfor(
     return oclkern
 
 
-def compile_func(pyfunc, return_type, args, debug=None):
-    cres = compile_with_depx(
-        pyfunc=pyfunc,
-        return_type=return_type,
-        args=args,
-        is_kernel=False,
-        debug=debug,
-    )
-    func = cres.library.get_function(cres.fndesc.llvm_func_name)
-    cres.target_context.mark_ocl_device(func)
-    devfn = DpexFunction(cres)
-
-    class _function_template(ConcreteTemplate):
-        key = devfn
-        cases = [cres.signature]
-
-    cres.typing_context.insert_user_function(devfn, _function_template)
-    libs = [cres.library]
-    cres.target_context.insert_user_function(devfn, cres.fndesc, libs)
-    return devfn
-
-
-def compile_func_template(pyfunc, debug=None):
-    """Compile a DpexFunctionTemplate"""
-    from .core.descriptor import dpex_target
-
-    dft = DpexFunctionTemplate(pyfunc, debug=debug)
-
-    class _function_template(AbstractTemplate):
-        key = dft
-
-        def generic(self, args, kws):
-            assert not kws
-            return dft.compile(args)
-
-    typingctx = dpex_target.typing_context
-    typingctx.insert_user_function(dft, _function_template)
-    return dft
-
-
-class DpexFunctionTemplate(object):
-    """Unmaterialized dpex function"""
-
-    def __init__(self, pyfunc, debug=None):
-        self.py_func = pyfunc
-        self.debug = debug
-        # self.inline = inline
-        self._compileinfos = {}
-
-    def compile(self, args):
-        """Compile the function for the given argument types.
-
-        Each signature is compiled once by caching the compiled function inside
-        this object.
-        """
-        if args not in self._compileinfos:
-            cres = compile_with_depx(
-                pyfunc=self.py_func,
-                return_type=None,
-                args=args,
-                is_kernel=False,
-                debug=self.debug,
-            )
-            func = cres.library.get_function(cres.fndesc.llvm_func_name)
-            cres.target_context.mark_ocl_device(func)
-            first_definition = not self._compileinfos
-            self._compileinfos[args] = cres
-            libs = [cres.library]
-
-            if first_definition:
-                # First definition
-                cres.target_context.insert_user_function(
-                    self, cres.fndesc, libs
-                )
-            else:
-                cres.target_context.add_user_function(self, cres.fndesc, libs)
-
-        else:
-            cres = self._compileinfos[args]
-
-        return cres.signature
-
-
-class DpexFunction(object):
-    def __init__(self, cres):
-        self.cres = cres
-
-
 def _ensure_valid_work_item_grid(val, sycl_queue):
 
     if not isinstance(val, (tuple, list, int)):
diff --git a/numba_dpex/core/passbuilder.py b/numba_dpex/core/passbuilder.py
deleted file mode 100644
index 999b356b11..0000000000
--- a/numba_dpex/core/passbuilder.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from numba.core.compiler_machinery import PassManager
-from numba.core.typed_passes import (
-    AnnotateTypes,
-    InlineOverloads,
-    IRLegalization,
-    NopythonRewrites,
-    NoPythonSupportedFeatureValidation,
-    NopythonTypeInference,
-    PreLowerStripPhis,
-)
-from numba.core.untyped_passes import (
-    DeadBranchPrune,
-    FindLiterallyCalls,
-    FixupArgs,
-    GenericRewrites,
-    InlineClosureLikes,
-    InlineInlinables,
-    IRProcessing,
-    LiteralUnroll,
-    MakeFunctionToJitFunction,
-    ReconstructSSA,
-    RewriteSemanticConstants,
-    TranslateByteCode,
-    WithLifting,
-)
-
-from numba_dpex.core.passes.passes import (
-    ConstantSizeStaticLocalMemoryPass,
-    DpexLowering,
-    DumpParforDiagnostics,
-    NoPythonBackend,
-    ParforPass,
-    PreParforPass,
-)
-from numba_dpex.core.passes.rename_numpy_functions_pass import (
-    RewriteNdarrayFunctionsPass,
-    RewriteOverloadedNumPyFunctionsPass,
-)
-
-
-class PassBuilder(object):
-    """
-    This is a pass builder to run Intel GPU/CPU specific
-    code-generation and optimization passes. This pass builder does
-    not offer objectmode and interpreted passes.
-    """
-
-    @staticmethod
-    def default_numba_nopython_pipeline(state, pm):
-        """Adds the default set of NUMBA passes to the pass manager"""
-        if state.func_ir is None:
-            pm.add_pass(TranslateByteCode, "analyzing bytecode")
-            pm.add_pass(FixupArgs, "fix up args")
-        pm.add_pass(IRProcessing, "processing IR")
-        pm.add_pass(WithLifting, "Handle with contexts")
-
-        # this pass rewrites name of NumPy functions we intend to overload
-        pm.add_pass(
-            RewriteOverloadedNumPyFunctionsPass,
-            "Rewrite name of Numpy functions to overload already overloaded function",
-        )
-
-        # Add pass to ensure when users are allocating static
-        # constant memory the size is a constant and can not
-        # come from a closure variable
-        pm.add_pass(
-            ConstantSizeStaticLocalMemoryPass,
-            "dpex constant size for static local memory",
-        )
-
-        # inline closures early in case they are using nonlocal's
-        # see issue #6585.
-        pm.add_pass(
-            InlineClosureLikes, "inline calls to locally defined closures"
-        )
-
-        # pre typing
-        if not state.flags.no_rewrites:
-            pm.add_pass(RewriteSemanticConstants, "rewrite semantic constants")
-            pm.add_pass(DeadBranchPrune, "dead branch pruning")
-            pm.add_pass(GenericRewrites, "nopython rewrites")
-
-        # convert any remaining closures into functions
-        pm.add_pass(
-            MakeFunctionToJitFunction,
-            "convert make_function into JIT functions",
-        )
-        # inline functions that have been determined as inlinable and rerun
-        # branch pruning, this needs to be run after closures are inlined as
-        # the IR repr of a closure masks call sites if an inlinable is called
-        # inside a closure
-        pm.add_pass(InlineInlinables, "inline inlinable functions")
-        if not state.flags.no_rewrites:
-            pm.add_pass(DeadBranchPrune, "dead branch pruning")
-
-        pm.add_pass(FindLiterallyCalls, "find literally calls")
-        pm.add_pass(LiteralUnroll, "handles literal_unroll")
-
-        if state.flags.enable_ssa:
-            pm.add_pass(ReconstructSSA, "ssa")
-
-        # typing
-        pm.add_pass(NopythonTypeInference, "nopython frontend")
-        pm.add_pass(AnnotateTypes, "annotate types")
-
-        pm.add_pass(
-            RewriteNdarrayFunctionsPass,
-            "Rewrite numpy.ndarray functions to dpnp.ndarray functions",
-        )
-
-        # strip phis
-        pm.add_pass(PreLowerStripPhis, "remove phis nodes")
-
-        # optimisation
-        pm.add_pass(InlineOverloads, "inline overloaded functions")
-
-    @staticmethod
-    def define_nopython_pipeline(state, name="dpex_nopython"):
-        """Returns an nopython mode pipeline based PassManager"""
-        pm = PassManager(name)
-        PassBuilder.default_numba_nopython_pipeline(state, pm)
-
-        # Intel GPU/CPU specific optimizations
-        pm.add_pass(PreParforPass, "Preprocessing for parfors")
-        if not state.flags.no_rewrites:
-            pm.add_pass(NopythonRewrites, "nopython rewrites")
-        pm.add_pass(ParforPass, "convert to parfors")
-
-        # legalise
-        pm.add_pass(
-            NoPythonSupportedFeatureValidation,
-            "ensure features that are in use are in a valid form",
-        )
-        pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering")
-
-        # lower
-        pm.add_pass(DpexLowering, "Custom Lowerer with auto-offload support")
-        pm.add_pass(NoPythonBackend, "nopython mode backend")
-        pm.add_pass(DumpParforDiagnostics, "dump parfor diagnostics")
-        pm.finalize()
-        return pm
diff --git a/numba_dpex/core/passes/lowerer.py b/numba_dpex/core/passes/lowerer.py
index d48926d265..8dfde3e6c9 100644
--- a/numba_dpex/core/passes/lowerer.py
+++ b/numba_dpex/core/passes/lowerer.py
@@ -1065,7 +1065,7 @@ def relatively_deep_copy(obj, memo):
     from numba.core.typing.templates import Signature
     from numba.np.ufunc.dufunc import DUFunc
 
-    from numba_dpex.compiler import DpexFunctionTemplate
+    from numba_dpex.core.kernel_interface.func import DpexFunctionTemplate
 
     # objects which shouldn't or can't be copied and it's ok not to copy it.
     if isinstance(
@@ -1198,14 +1198,15 @@ def relatively_deep_copy(obj, memo):
         memo[obj_id] = cpy
         return cpy
 
-    # some python objects are not copyable. In such case exception would be raised
-    # it is just a convinient point to find such objects
+    # some python objects are not copyable. In such case exception would be
+    # raised it is just a convinient point to find such objects
     try:
         cpy = copy.copy(obj)
     except Exception as e:
         raise e
 
-    # __slots__ for subclass specify only members declared in subclass. So to get all members we need to go through
+    # __slots__ for subclass specify only members declared in subclass. So to
+    # get all members we need to go through
     # all supeclasses
     def get_slots_members(obj):
         keys = []
diff --git a/numba_dpex/offload_dispatcher.py b/numba_dpex/offload_dispatcher.py
index 172607f813..8efb760adc 100644
--- a/numba_dpex/offload_dispatcher.py
+++ b/numba_dpex/offload_dispatcher.py
@@ -22,7 +22,7 @@ def __init__(
         pipeline_class=compiler.Compiler,
     ):
         if dpex_config.HAS_NON_HOST_DEVICE:
-            from numba_dpex.compiler import Compiler
+            from numba_dpex.core.compiler import Compiler
 
             targetoptions["parallel"] = True
             dispatcher.Dispatcher.__init__(

From ce53d58d6ddce4b2fe1fe7366838f8f32761f124 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 6 Dec 2022 18:39:16 -0600
Subject: [PATCH 20/51] Mark the caching tests as xfail.

---
 numba_dpex/tests/kernel_tests/test_caching.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/numba_dpex/tests/kernel_tests/test_caching.py b/numba_dpex/tests/kernel_tests/test_caching.py
index 09ceb27d57..cfe0c6dcee 100644
--- a/numba_dpex/tests/kernel_tests/test_caching.py
+++ b/numba_dpex/tests/kernel_tests/test_caching.py
@@ -10,6 +10,7 @@
 from numba_dpex.tests._helper import filter_strings
 
 
+@pytest.mark.xfail
 @pytest.mark.parametrize("filter_str", filter_strings)
 def test_caching_kernel_using_same_queue(filter_str):
     """Test kernel caching when the same queue is used to submit a kernel
@@ -42,6 +43,7 @@ def data_parallel_sum(a, b, c):
             assert _kernel == cached_kernel
 
 
+@pytest.mark.xfail
 @pytest.mark.parametrize("filter_str", filter_strings)
 def test_caching_kernel_using_same_context(filter_str):
     """Test kernel caching for the scenario where different SYCL queues that

From 5ffc555536247a896a99f1cc0cdf6a809ebbff2d Mon Sep 17 00:00:00 2001
From: "akmkhale@ansatnuc04" <akm.khaled.talukder@intel.com>
Date: Thu, 8 Dec 2022 02:08:11 -0600
Subject: [PATCH 21/51] A new LRU cache backed by numba's pickling mechanism

---
 numba_dpex/codegen.py                         |   3 +
 numba_dpex/config.py                          |  15 +
 numba_dpex/core/caching.py                    | 503 ++++++++++++++++++
 .../core/kernel_interface/dispatcher.py       |  80 ++-
 .../core/kernel_interface/spirv_kernel.py     |   8 +
 numba_dpex/decorators.py                      |  19 +-
 numba_dpex/tests/kernel_tests/test_caching.py |  90 ++--
 7 files changed, 645 insertions(+), 73 deletions(-)
 create mode 100644 numba_dpex/core/caching.py

diff --git a/numba_dpex/codegen.py b/numba_dpex/codegen.py
index b54d46741c..9f45b444d5 100644
--- a/numba_dpex/codegen.py
+++ b/numba_dpex/codegen.py
@@ -66,6 +66,9 @@ def _init(self, llvm_module):
         assert list(llvm_module.global_variables) == [], "Module isn't empty"
         self._data_layout = SPIR_DATA_LAYOUT[utils.MACHINE_BITS]
         self._target_data = ll.create_target_data(self._data_layout)
+        self._tm_features = (
+            ""  # We need this for chaching, not sure about this value for now
+        )
 
     def _create_empty_module(self, name):
         ir_module = lc.Module(name)
diff --git a/numba_dpex/config.py b/numba_dpex/config.py
index 3cbbca2ba1..7fc121d059 100644
--- a/numba_dpex/config.py
+++ b/numba_dpex/config.py
@@ -86,6 +86,21 @@ def __getattr__(name):
     "NUMBA_DPEX_DEBUGINFO", int, config.DEBUGINFO_DEFAULT
 )
 
+# configs for caching
+# To see the debug messages for the caching.
+# Execute like:
+#   NUMBA_DPEX_DEBUG_CACHE=1 python <code>
+DEBUG_CACHE = _readenv("NUMBA_DPEX_DEBUG_CACHE", int, 0)
+# This is a global flag to turn the caching on/off,
+# regardless of whatever has been specified in Dispatcher.
+# Useful for debugging. Execute like:
+#   NUMBA_DPEX_ENABLE_CACHE=0 python <code>
+# to turn off the caching globally.
+ENABLE_CACHE = _readenv("NUMBA_DPEX_ENABLE_CACHE", int, 1)
+# Capacity of the cache, execute it like:
+#   NUMBA_DPEX_CACHE_SIZE=20 python <code>
+CACHE_SIZE = _readenv("NUMBA_DPEX_CACHE_SIZE", int, 10)
+
 TESTING_SKIP_NO_DPNP = _readenv("NUMBA_DPEX_TESTING_SKIP_NO_DPNP", int, 0)
 TESTING_SKIP_NO_DEBUGGING = _readenv(
     "NUMBA_DPEX_TESTING_SKIP_NO_DEBUGGING", int, 1
diff --git a/numba_dpex/core/caching.py b/numba_dpex/core/caching.py
new file mode 100644
index 0000000000..1810c52b5d
--- /dev/null
+++ b/numba_dpex/core/caching.py
@@ -0,0 +1,503 @@
+# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import hashlib
+import sys
+from abc import ABCMeta, abstractmethod
+
+from numba.core.caching import CacheImpl, IndexDataCacheFile
+from numba.core.serialize import dumps
+
+from numba_dpex import config
+
+
+def build_key(sig, argtypes, pyfunc, codegen, backend=None, device_type=None):
+    """Constructs a key from python function, context, backend
+        and the device type.
+
+    Compute index key for the given signature and codegen.
+    It includes a description of the OS, target architecture
+    and hashes of the bytecode for the function and, if the
+    function has a __closure__, a hash of the cell_contents.
+
+    Args:
+        sig (inspect.Signature): The signature object of
+            a python function.
+        codegen (numba.core.codegen.Codegen):
+            The codegen object found from the target context.
+        backend (enum, optional): A 'backend_type' enum.
+            Defaults to None.
+        device_type (enum, optional): A 'device_type' enum.
+            Defaults to None.
+
+    Returns:
+        tuple: A tuple of signature, magic_tuple of codegen
+            and another tuple of hashcodes from bytecode and
+            cell_contents.
+    """
+
+    codebytes = pyfunc.__code__.co_code
+    if pyfunc.__closure__ is not None:
+        try:
+            cvars = tuple([x.cell_contents for x in pyfunc.__closure__])
+            # Note: cloudpickle serializes a function differently depending
+            #       on how the process is launched; e.g. multiprocessing.Process
+            cvarbytes = dumps(cvars)
+        except:
+            cvarbytes = b""  # a temporary solution for function template
+    else:
+        cvarbytes = b""
+
+    return (
+        sig,
+        argtypes,
+        codegen.magic_tuple(),
+        backend,
+        device_type,
+        (
+            hashlib.sha256(codebytes).hexdigest(),
+            hashlib.sha256(cvarbytes).hexdigest(),
+        ),
+    )
+
+
+class _CacheImpl(CacheImpl):
+    """Implementation of `CacheImpl` to be used by subclasses of `_Cache`.
+
+    This class is an implementation of `CacheImpl` to be used by subclasses
+    of `_Cache`. To be assigned in `_impl_class`. Implements the more common
+    and core mechanism for the caching.
+
+    """
+
+    def reduce(self, data):
+        """Serialize an object before caching.
+        Args:
+            data (object): The object to be serialized before pickling.
+        """
+        # TODO: Implement, but looks like we might not need it at all.
+        # Look at numba.core.caching for how to implement.
+        pass
+
+    def rebuild(self, target_context, reduced_data):
+        """Deserialize after unpickling from the cache.
+        Args:
+            target_context (numba_dpex.core.target.DpexTargetContext):
+                The target context for the kernel.
+            reduced_data (object): The data to be deserialzed after unpickling.
+        """
+        # TODO: Implement, but looks like we might not need it at all.
+        # Look at numba.core.caching for how to implement.
+        pass
+
+    def check_cachable(self, cres):
+        """Check if a certain object is cacheable.
+
+        Args:
+            cres (object): The object to be cached. For example, if the object is
+            `CompileResult`, then you might want to follow the similar checks as
+            has been done in `numba.core.caching.CompileResultCacheImpl`.
+
+        Returns:
+            bool: Return `True` if cacheable, otherwise `False`.
+        """
+        # TODO: Although, for the time being, assuming all Kernels in numba_dpex
+        # are always cachable. However, we might need to add some bells and
+        # whistles in the future. Look at numba.core.caching for how to implement.
+        return True
+
+
+class AbstractCache(metaclass=ABCMeta):
+    """Abstract cache class to specify basic caching operations.
+
+    This class will be used to create an non-functional dummy cache
+    (i.e. NullCache) and other functional cache. The dummy cache
+    will be used as a placeholder when caching is disabled.
+
+    Args:
+        metaclass (type, optional): Metaclass for the abstract class.
+            Defaults to ABCMeta.
+    """
+
+    @abstractmethod
+    def get(self):
+        """An abstract method to retrieve item from the cache."""
+
+    @abstractmethod
+    def put(self, key, value):
+        """An abstract method to save item into the cache.
+
+        Args:
+            key (object): The key for the data
+                (i.e. compiled kernel/function etc.).
+            value (object): The data (i.e. compiled kernel/function)
+                to be saved.
+        """
+
+
+class NullCache(AbstractCache):
+    """A dummy cache used if user decides to disable caching.
+
+    If the caching is disabled this class will be used to
+    perform all caching operations, all of which will be basically
+    NOP. This idea is copied from numba.
+
+    Args:
+        AbstractCache (class): The abstract cache from which all
+        other caching classes will be derived.
+    """
+
+    def get(self, key):
+        """Function to get an item (i.e. compiled kernel/function)
+        from the cache
+
+        Args:
+            key (object): The key to retrieve the
+                data (i.e. compiled kernel/function)
+
+        Returns:
+            None: Returns None.
+        """
+        return None
+
+    def put(self, key, value):
+        """Function to save a compiled kernel/function
+        into the cache.
+
+        Args:
+            key (object): The key to the data (i.e. compiled kernel/function).
+            value (object): The data to be cached (i.e. compiled kernel/function).
+        """
+        pass
+
+
+class Node:
+    """A 'Node' class for LRUCache."""
+
+    def __init__(self, key, value):
+        """Constructor for the Node.
+
+        Args:
+            key (object): The key to the value.
+            value (object): The data to be saved.
+        """
+        self.key = key
+        self.value = value
+        self.next = None
+        self.previous = None
+
+    def __str__(self):
+        """__str__ for Node.
+
+        Returns:
+            str: A human readable representation of a Node.
+        """
+        return "(" + str(self.key) + ": " + str(self.value) + ")"
+
+    def __repr__(self):
+        """__repr__ for Node
+
+        Returns:
+            str: A human readable representation of a Node.
+        """
+        return self.__str__()
+
+
+class LRUCache(AbstractCache):
+    """LRUCache implementation for caching kernels,
+    functions and modules.
+
+    The cache is basically a doubly-linked-list backed
+    with a dictionary as a lookup table.
+    """
+
+    def __init__(self, capacity=10, pyfunc=None):
+        """Constructor for LRUCache.
+
+        Args:
+            capacity (int, optional): The max capacity of the cache.
+                Defaults to 10.
+            pyfunc (NoneType, optional): A python function to be cached.
+                Defaults to None.
+        """
+        self._capacity = capacity
+        self._lookup = {}
+        self._evicted = {}
+        self._dummy = Node(0, 0)
+        self._head = self._dummy.next
+        self._tail = self._dummy.next
+        self._pyfunc = pyfunc
+        self._cache_file = None
+        # if pyfunc is specified, we will use files for evicted items
+        if self._pyfunc is not None:
+            # _CacheImpl object to be used
+            self._impl_class = _CacheImpl
+            self._impl = self._impl_class(self._pyfunc)
+            self._cache_path = self._impl.locator.get_cache_path()
+            # This may be a bit strict but avoids us maintaining a magic number
+            source_stamp = self._impl.locator.get_source_stamp()
+            filename_base = self._impl.filename_base
+            self._cache_file = IndexDataCacheFile(
+                cache_path=self._cache_path,
+                filename_base=filename_base,
+                source_stamp=source_stamp,
+            )
+
+    @property
+    def head(self):
+        """Get the head of the cache.
+
+        This is used for testing/debugging purposes.
+
+        Returns:
+            Node: The head of the cache.
+        """
+        return self._head
+
+    @property
+    def tail(self):
+        """Get the tail of the cache.
+
+        This is used for testing/debugging purposes.
+
+        Returns:
+            Node: The tail of the cache.
+        """
+        return self._tail
+
+    @property
+    def evicted(self):
+        """Get the list of evicted items from the cache.
+
+        This is used for testing/debugging purposes.
+
+        Returns:
+            dict: A table of evicted items from the cache.
+        """
+        return self._evicted
+
+    def _get_memsize(self, obj, seen=None):
+        """Recursively finds size of *almost any* object.
+
+        This function might be useful in the future when
+        size based (not count based) cache limit will be
+        implemented.
+
+        Args:
+            obj (object): Any object.
+            seen (set, optional): Set of seen object id().
+                Defaults to None.
+
+        Returns:
+            int: Size of the object in bytes.
+        """
+        size = sys.getsizeof(obj)
+        if seen is None:
+            seen = set()
+        obj_id = id(obj)
+        if obj_id in seen:
+            return 0
+        # Important mark as seen *before* entering recursion to gracefully handle
+        # self-referential objects
+        seen.add(obj_id)
+        if isinstance(obj, dict):
+            size += sum([self._get_memsize(v, seen) for v in obj.values()])
+            size += sum([self._get_memsize(k, seen) for k in obj.keys()])
+        elif hasattr(obj, "__dict__"):
+            size += self._get_memsize(obj.__dict__, seen)
+        elif hasattr(obj, "__iter__") and not isinstance(
+            obj, (str, bytes, bytearray)
+        ):
+            size += sum([self._get_memsize(i, seen) for i in obj])
+        return size
+
+    def size(self):
+        """Get the current size of the cache.
+
+        Returns:
+            int: The current number of items in the cache.
+        """
+        return len(self._lookup)
+
+    def memsize(self):
+        """Get the total memory size of the cache.
+
+        This function might be useful in the future when
+        size based (not count based) cache limit will be
+        implemented.
+
+        Returns:
+            int: Get the total memory size of the cache in bytes.
+        """
+        size = 0
+        current = self._head
+        while current:
+            size = size + self._get_memsize(current.value)
+            current = current.next
+        return size
+
+    def __str__(self):
+        """__str__ function for the cache
+
+        Returns:
+            str: A human readable representation of the cache.
+        """
+        items = []
+        current = self._head
+        while current:
+            items.append(str(current))
+            current = current.next
+        return "{" + ", ".join(items) + "}"
+
+    def __repr__(self):
+        """__repr__ function for the cache
+
+        Returns:
+            str: A human readable representation of the cache.
+        """
+        return self.__str__()
+
+    def clean(self):
+        """Clean the cache"""
+        self._lookup = {}
+        self._evicted = {}
+        self._dummy = Node(0, 0)
+        self._head = self._dummy.next
+        self._tail = self._dummy.next
+
+    def _remove_head(self):
+        """Remove the head of the cache"""
+        if not self._head:
+            return
+        prev = self._head
+        self._head = self._head.next
+        if self._head:
+            self._head.previous = None
+        del prev
+
+    def _append_tail(self, new_node):
+        """Add the new node to the tail end"""
+        if not self._tail:
+            self._head = self._tail = new_node
+        else:
+            self._tail.next = new_node
+            new_node.previous = self._tail
+            self._tail = self._tail.next
+
+    def _unlink_node(self, node):
+        """Unlink current linked node"""
+        if node is None:
+            return
+
+        if self._head is node:
+            self._head = node.next
+            if node.next:
+                node.next.previous = None
+            node.previous, node.next = None, None
+            return
+
+        # removing the node from somewhere in the middle; update pointers
+        prev, nex = node.previous, node.next
+        prev.next = nex
+        nex.previous = prev
+        node.previous, node.next = None, None
+
+    def get(self, key):
+        """Get the value associated with the key.
+
+        Args:
+            key (object): A key for the lookup table.
+
+        Returns:
+            object: The value associated with the key.
+        """
+
+        if key not in self._lookup:
+            if key not in self._evicted:
+                return None
+            elif self._cache_file:
+                value = self._cache_file.load(key)
+                if config.DEBUG_CACHE:
+                    print(
+                        "[cache]: unpickled an evicted artifact, key: {0:s}.".format(
+                            str(key)
+                        )
+                    )
+            else:
+                value = self._evicted[key]
+            self.put(key, value)
+            return value
+        else:
+            if config.DEBUG_CACHE:
+                print(
+                    "[cache] size: {0:d}, loading artifact, key: {1:s}".format(
+                        len(self._lookup), str(key)
+                    )
+                )
+            node = self._lookup[key]
+
+        if node is not self._tail:
+            self._unlink_node(node)
+            self._append_tail(node)
+
+        return node.value
+
+    def put(self, key, value):
+        """Store the key-value pair into the cache.
+
+        Args:
+            key (object): The key for the data.
+            value (object): The data to be saved.
+        """
+        if key in self._lookup:
+            if config.DEBUG_CACHE:
+                print(
+                    "[cache] size: {0:d}, storing artifact, key: {1:s}".format(
+                        len(self._lookup), str(key)
+                    )
+                )
+            self._lookup[key].value = value
+            self.get(key)
+            return
+
+        if key in self._evicted:
+            self._evicted.pop(key)
+
+        if len(self._lookup) >= self._capacity:
+            # remove head node and correspond key
+            if self._cache_file:
+                if config.DEBUG_CACHE:
+                    print(
+                        "[cache] size: {0:d}, pickling the LRU item, key: {1:s}, indexed at {2:s}.".format(
+                            len(self._lookup),
+                            str(self._head.key),
+                            self._cache_file._index_path,
+                        )
+                    )
+                self._cache_file.save(self._head.key, self._head.value)
+                self._evicted[
+                    self._head.key
+                ] = None  # as we are using cache files, we save memory
+            else:
+                self._evicted[self._head.key] = self._head.value
+            self._lookup.pop(self._head.key)
+            if config.DEBUG_CACHE:
+                print(
+                    "[cache] size: {0:d}, capacity exceeded, evicted".format(
+                        len(self._lookup)
+                    ),
+                    self._head.key,
+                )
+            self._remove_head()
+
+        # add new node and hash key
+        new_node = Node(key, value)
+        self._lookup[key] = new_node
+        self._append_tail(new_node)
+        if config.DEBUG_CACHE:
+            print(
+                "[cache] size: {0:d}, saved artifact, key: {1:s}".format(
+                    len(self._lookup), str(key)
+                )
+            )
diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index 9cf8fc3e09..4c10055683 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -2,15 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+
 import copy
 from inspect import signature
 from warnings import warn
 
 import dpctl
 import dpctl.program as dpctl_prog
+from numba.core import utils
 from numba.core.types import Array as ArrayType
 
 from numba_dpex import config
+from numba_dpex.core.caching import LRUCache, NullCache, build_key
 from numba_dpex.core.descriptor import dpex_target
 from numba_dpex.core.exceptions import (
     ComputeFollowsDataInferenceError,
@@ -57,6 +60,7 @@ def __init__(
         debug_flags=None,
         compile_flags=None,
         array_access_specifiers=None,
+        enable_cache=True,
     ):
         self.typingctx = dpex_target.typing_context
         self.pyfunc = pyfunc
@@ -66,6 +70,16 @@ def __init__(
         # TODO: To be removed once the__getitem__ is removed
         self._global_range = None
         self._local_range = None
+        # caching related attributes
+        if not config.ENABLE_CACHE:
+            self._cache = NullCache()
+        elif enable_cache:
+            self._cache = LRUCache(
+                capacity=config.CACHE_SIZE, pyfunc=self.pyfunc
+            )
+        else:
+            self._cache = NullCache()
+        self._cache_hits = 0
 
         if array_access_specifiers:
             warn(
@@ -85,6 +99,22 @@ def __init__(
         else:
             self._create_sycl_kernel_bundle_flags = []
 
+    @property
+    def cache(self):
+        return self._cache
+
+    @property
+    def cache_hits(self):
+        return self._cache_hits
+
+    def enable_caching(self):
+        if not config.ENABLE_CACHE:
+            self._cache = NullCache()
+        else:
+            self._cache = LRUCache(
+                capacity=config.CACHE_SIZE, pyfunc=self.pyfunc
+            )
+
     def _check_range(self, range, device):
 
         if not isinstance(range, (tuple, list)):
@@ -406,6 +436,7 @@ def __call__(self, *args, global_range=None, local_range=None):
 
         exec_queue = self._determine_kernel_launch_queue(args, argtypes)
         backend = exec_queue.backend
+        device_type = exec_queue.sycl_device.device_type
 
         if exec_queue.backend not in [
             dpctl.backend_type.opencl,
@@ -420,24 +451,53 @@ def __call__(self, *args, global_range=None, local_range=None):
             global_range, local_range, exec_queue.sycl_device
         )
 
-        # TODO: Enable caching of kernels, but do it using Numba's caching
-        # machinery
-
-        kernel = SpirvKernel(self.pyfunc, self.kernel_name)
-        kernel.compile(
-            arg_types=argtypes,
-            debug=self.debug_flags,
-            extra_compile_flags=self.compile_flags,
+        # TODO: Enable caching of kernels, but do it using LRU
+        # caching and numba's pickle framework.
+
+        # load the kernel from cache
+        sig = utils.pysignature(self.pyfunc)
+        key = build_key(
+            sig,
+            tuple(argtypes),
+            self.pyfunc,
+            dpex_target.target_context.codegen(),
+            backend=backend,
+            device_type=device_type,
         )
+        artifact = self._cache.get(key)
+        # if it's not cached, i.e. first time
+        if artifact is not None:
+            device_driver_ir_module, kernel_module_name = artifact
+            self._cache_hits += 1
+        else:
+            kernel = SpirvKernel(self.pyfunc, self.kernel_name)
+            kernel.compile(
+                arg_types=argtypes,
+                debug=self.debug_flags,
+                extra_compile_flags=self.compile_flags,
+            )
+
+            device_driver_ir_module = kernel.device_driver_ir_module
+            kernel_module_name = kernel.module_name
+
+            key = build_key(
+                sig,
+                tuple(argtypes),
+                self.pyfunc,
+                kernel.target_context.codegen(),
+                backend=backend,
+                device_type=device_type,
+            )
+            self._cache.put(key, (device_driver_ir_module, kernel_module_name))
 
         # create a sycl::KernelBundle
         kernel_bundle = dpctl_prog.create_program_from_spirv(
             exec_queue,
-            kernel.device_driver_ir_module,
+            device_driver_ir_module,
             " ".join(self._create_sycl_kernel_bundle_flags),
         )
         #  get the sycl::kernel
-        kernel = kernel_bundle.get_sycl_kernel(kernel.module_name)
+        kernel = kernel_bundle.get_sycl_kernel(kernel_module_name)
 
         packer = Packer(
             kernel_name=self.kernel_name,
diff --git a/numba_dpex/core/kernel_interface/spirv_kernel.py b/numba_dpex/core/kernel_interface/spirv_kernel.py
index efbf438da4..cb11f345b5 100644
--- a/numba_dpex/core/kernel_interface/spirv_kernel.py
+++ b/numba_dpex/core/kernel_interface/spirv_kernel.py
@@ -38,6 +38,7 @@ def __init__(self, func, func_name) -> None:
             self._func_ty = ir.FunctionIR
         else:
             raise UnreachableError()
+        self._target_context = None
 
     @property
     def llvm_module(self):
@@ -68,6 +69,13 @@ def module_name(self):
         else:
             raise UncompiledKernelError(self._pyfunc_name)
 
+    @property
+    def target_context(self):
+        if self._target_context:
+            return self._target_context
+        else:
+            raise UncompiledKernelError(self._pyfunc_name)
+
     def compile(self, arg_types, debug, extra_compile_flags):
         """_summary_
 
diff --git a/numba_dpex/decorators.py b/numba_dpex/decorators.py
index be73352009..efb05b1c2e 100644
--- a/numba_dpex/decorators.py
+++ b/numba_dpex/decorators.py
@@ -17,7 +17,7 @@
 from numba_dpex.utils import npytypes_array_to_dpex_array
 
 
-def kernel(func_or_sig=None, access_types=None, debug=None):
+def kernel(func_or_sig=None, access_types=None, debug=None, enable_cache=True):
     """The decorator to write a numba_dpex kernel function.
 
     A kernel function is conceptually equivalent to a SYCL kernel function, and
@@ -29,15 +29,21 @@ def kernel(func_or_sig=None, access_types=None, debug=None):
           and have the same dtype.
     """
     if func_or_sig is None:
-        return autojit(debug=debug, access_types=access_types)
+        return autojit(
+            debug=debug, access_types=access_types, enable_cache=enable_cache
+        )
     elif not sigutils.is_signature(func_or_sig):
         func = func_or_sig
-        return autojit(debug=debug, access_types=access_types)(func)
+        return autojit(
+            debug=debug, access_types=access_types, enable_cache=enable_cache
+        )(func)
     else:
-        return _kernel_jit(func_or_sig, debug, access_types)
+        return _kernel_jit(
+            func_or_sig, debug, access_types, enable_cache=enable_cache
+        )
 
 
-def autojit(debug=None, access_types=None):
+def autojit(debug=None, access_types=None, enable_cache=True):
     def _kernel_dispatcher(pyfunc):
         ordered_arg_access_types = get_ordered_arg_access_types(
             pyfunc, access_types
@@ -46,12 +52,13 @@ def _kernel_dispatcher(pyfunc):
             pyfunc=pyfunc,
             debug_flags=debug,
             array_access_specifiers=ordered_arg_access_types,
+            enable_cache=enable_cache,
         )
 
     return _kernel_dispatcher
 
 
-def _kernel_jit(signature, debug, access_types):
+def _kernel_jit(signature, debug, access_types, enable_cache=True):
     argtypes, rettype = sigutils.normalize_signature(signature)
     argtypes = tuple(
         [
diff --git a/numba_dpex/tests/kernel_tests/test_caching.py b/numba_dpex/tests/kernel_tests/test_caching.py
index cfe0c6dcee..25385fa3a8 100644
--- a/numba_dpex/tests/kernel_tests/test_caching.py
+++ b/numba_dpex/tests/kernel_tests/test_caching.py
@@ -3,78 +3,54 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import dpctl
+import dpctl.tensor as dpt
 import numpy as np
 import pytest
 
 import numba_dpex as dpex
+from numba_dpex.core.kernel_interface.dispatcher import (
+    Dispatcher,
+    get_ordered_arg_access_types,
+)
 from numba_dpex.tests._helper import filter_strings
 
 
-@pytest.mark.xfail
 @pytest.mark.parametrize("filter_str", filter_strings)
-def test_caching_kernel_using_same_queue(filter_str):
-    """Test kernel caching when the same queue is used to submit a kernel
-    multiple times.
+def test_caching_hit_counts(filter_str):
+    """Tests the correct number of cache hits.
+    If a Dispatcher is invoked 10 times and if the caching is enabled,
+    then the total number of cache hits will be 9. Given the fact that
+    the first time the kernel will be compiled and it will be loaded
+    off the cache for the next time on.
 
     Args:
-        filter_str: SYCL filter selector string
+        filter_str (str): The device name coming from filter_strings in
+        ._helper.py
     """
-    global_size = 10
-    N = global_size
 
-    def data_parallel_sum(a, b, c):
+    def data_parallel_sum(x, y, z):
+        """
+        Vector addition using the ``kernel`` decorator.
+        """
         i = dpex.get_global_id(0)
-        c[i] = a[i] + b[i]
+        z[i] = x[i] + y[i]
 
-    a = np.array(np.random.random(N), dtype=np.float32)
-    b = np.array(np.random.random(N), dtype=np.float32)
-    c = np.ones_like(a)
+    a = dpt.arange(0, 100, device=filter_str)
+    b = dpt.arange(0, 100, device=filter_str)
+    c = dpt.zeros_like(a, device=filter_str)
 
-    with dpctl.device_context(filter_str) as gpu_queue:
-        func = dpex.kernel(data_parallel_sum)
-        cached_kernel = func[global_size, dpex.DEFAULT_LOCAL_SIZE].specialize(
-            func._get_argtypes(a, b, c), gpu_queue
-        )
+    expected = dpt.asnumpy(a) + dpt.asnumpy(b)
 
-        for i in range(10):
-            _kernel = func[global_size, dpex.DEFAULT_LOCAL_SIZE].specialize(
-                func._get_argtypes(a, b, c), gpu_queue
-            )
-            assert _kernel == cached_kernel
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize("filter_str", filter_strings)
-def test_caching_kernel_using_same_context(filter_str):
-    """Test kernel caching for the scenario where different SYCL queues that
-    share a SYCL context are used to submit a kernel.
-
-    Args:
-        filter_str: SYCL filter selector string
-    """
-    global_size = 10
-    N = global_size
-
-    def data_parallel_sum(a, b, c):
-        i = dpex.get_global_id(0)
-        c[i] = a[i] + b[i]
+    d = Dispatcher(
+        data_parallel_sum,
+        array_access_specifiers=get_ordered_arg_access_types(
+            data_parallel_sum, None
+        ),
+    )
 
-    a = np.array(np.random.random(N), dtype=np.float32)
-    b = np.array(np.random.random(N), dtype=np.float32)
-    c = np.ones_like(a)
+    N = 10
+    for i in range(N):
+        d(a, b, c, global_range=[100])
+    actual = dpt.asnumpy(c)
 
-    # Set the global queue to the default device so that the cached_kernel gets
-    # created for that device
-    dpctl.set_global_queue(filter_str)
-    func = dpex.kernel(data_parallel_sum)
-    default_queue = dpctl.get_current_queue()
-    cached_kernel = func[global_size, dpex.DEFAULT_LOCAL_SIZE].specialize(
-        func._get_argtypes(a, b, c), default_queue
-    )
-    for i in range(0, 10):
-        # Each iteration create a fresh queue that will share the same context
-        with dpctl.device_context(filter_str) as gpu_queue:
-            _kernel = func[global_size, dpex.DEFAULT_LOCAL_SIZE].specialize(
-                func._get_argtypes(a, b, c), gpu_queue
-            )
-            assert _kernel == cached_kernel
+    assert np.array_equal(expected, actual) and (d.cache_hits == N - 1)

From 53c5f80478478b6017522dffccb27e8567a3704f Mon Sep 17 00:00:00 2001
From: "akmkhale@ansatnuc04" <akm.khaled.talukder@intel.com>
Date: Tue, 13 Dec 2022 02:17:32 -0600
Subject: [PATCH 22/51] Added rst for caching

Fix docs rst
---
 docs/developer_guides/caching.rst | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 docs/developer_guides/caching.rst

diff --git a/docs/developer_guides/caching.rst b/docs/developer_guides/caching.rst
new file mode 100644
index 0000000000..49186ca472
--- /dev/null
+++ b/docs/developer_guides/caching.rst
@@ -0,0 +1,24 @@
+.. _caching:
+
+Caching Mechanism in Numba-dpex
+================================
+
+Caching is done by saving the compiled kernel code, the ELF object of the executable code. By using the kernel code, cached kernels have minimal overhead because no compilation is needed.
+
+Unlike Numba, we do not perform file-based caching, instead we use an Least Recently Used (LRU) caching mechanism. However when a kernel needs to be evicted, we utilize numba's file-based caching mechanism described `here <https://numba.pydata.org/numba-doc/latest/developer/caching.html>`_.
+
+Algorithm
+==========
+
+The caching mechanism for Numba-dpex works as follows: The cache is an LRU cache backed by an ordered dictionary mapped onto a doubly linked list. The tail of the list contains the most recently used (MRU) kernel and the head of the list contains the least recently used (LRU) kernel. The list  has a fixed size. If a new kernel arrives to be cached and if the size is already on the maximum limit, the algorithm evicts the LRU kernel to make room for the MRU kernel. The evicted item will be serialized and pickled into a file using Numba's caching mechanism.
+
+Everytime whenever a kernel needs to be retrieved from the cache, the mechanism will look for the kernel in the cache and will be loaded if it's already present. However, if the program is seeking for a kernel that has been evicted, the algorithm will load it from the file and enqueue in the cache.
+
+Settings
+========
+
+Therefore, we employ similar environment variables as used in Numba, i.e. ``NUMBA_CACHE_DIR`` etc. However we add three more environment variables to control the caching mechanism.
+
+- In order to specify cache capacity, one can use ``NUMBA_DPEX_CACHE_SIZE``. By default, it's set to 10.
+- ``NUMBA_DPEX_ENABLE_CACHE`` can be used to enable/disable the caching mechanism. By default it's enabled, i.e. set to 1.
+- In order to enable the debugging messages related to caching, one can set ``NUMBA_DPEX_DEBUG_CACHE`` to 1. All environment variables are defined in :file:`numba_dpex/config.py`.

From c3e661b8385a8a6acc91a17bd3b7368cd24696b1 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Mon, 19 Dec 2022 10:17:58 -0600
Subject: [PATCH 23/51] Fix type import.

---
 numba_dpex/core/kernel_interface/arg_pack_unpacker.py | 4 ++--
 numba_dpex/core/kernel_interface/dispatcher.py        | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
index 0c63dfa1e5..40c3babb89 100644
--- a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
+++ b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
@@ -15,7 +15,7 @@
     UnsupportedAccessQualifierError,
     UnsupportedKernelArgumentError,
 )
-from numba_dpex.dpctl_iface import USMNdArrayType
+from numba_dpex.core.types import USMNdArray
 
 
 class _NumPyArrayPackerPayload:
@@ -216,7 +216,7 @@ def _unpack_argument(self, ty, val, access_specifier):
 
         """
 
-        if isinstance(ty, USMNdArrayType):
+        if isinstance(ty, USMNdArray):
             return self._unpack_usm_array(val)
         elif isinstance(ty, types.Array):
             return self._unpack_array(val, access_specifier)
diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index 4c10055683..26a8c5866a 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -28,7 +28,7 @@
 )
 from numba_dpex.core.kernel_interface.arg_pack_unpacker import Packer
 from numba_dpex.core.kernel_interface.spirv_kernel import SpirvKernel
-from numba_dpex.dpctl_iface import USMNdArrayType
+from numba_dpex.core.types import USMNdArray
 
 
 def get_ordered_arg_access_types(pyfunc, access_types):
@@ -223,17 +223,17 @@ def _determine_kernel_launch_queue(self, args, argtypes):
             ExecutionQueueInferenceError: If the queue could not be inferred
                 using the dpctl queue manager.
         """
-        # Temporary workaround as USMNdArrayType derives from Array
+        # Temporary workaround as USMNdArray derives from Array
         array_argnums = [
             i
             for i, arg in enumerate(args)
             if isinstance(argtypes[i], ArrayType)
-            and not isinstance(argtypes[i], USMNdArrayType)
+            and not isinstance(argtypes[i], USMNdArray)
         ]
         usmarray_argnums = [
             i
             for i, arg in enumerate(args)
-            if isinstance(argtypes[i], USMNdArrayType)
+            if isinstance(argtypes[i], USMNdArray)
         ]
 
         # if usm and non-usm array arguments are getting mixed, then the

From d718a0ea6c8590090a717abbe4425c48dcd59dca Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 27 Dec 2022 12:50:48 -0600
Subject: [PATCH 24/51] Bring back support for kernel specialization.

   - Kernel specialization for specific signatures was fixed.
   - Specialization is only allowed for usm_ndarray.
   - Dispacther class was renamed as JitKernel.
   - decorators updated.
---
 numba_dpex/core/exceptions.py                 |  31 ++-
 .../core/kernel_interface/dispatcher.py       | 229 +++++++++++++-----
 numba_dpex/decorators.py                      |  94 ++++---
 numba_dpex/device_init.py                     |   2 +-
 numba_dpex/tests/kernel_tests/test_caching.py |   4 +-
 5 files changed, 233 insertions(+), 127 deletions(-)

diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py
index 11098aae33..99791bb931 100644
--- a/numba_dpex/core/exceptions.py
+++ b/numba_dpex/core/exceptions.py
@@ -23,14 +23,20 @@ class KernelHasReturnValueError(Exception):
         the kernel function.
     """
 
-    def __init__(self, kernel_name, return_type) -> None:
+    def __init__(self, kernel_name, return_type, sig=None) -> None:
         self.return_type = return_type
-        self.kernel_name = kernel_name
-        self.message = (
-            f'Kernel "{self.kernel_name}" has a return value '
-            f'of type "{self.return_type}". '
-            "A numba-dpex kernel must have a void return type."
-        )
+        if sig:
+            self.message = (
+                f'Specialized kernel signature "{sig}" has a return value '
+                f'of type "{return_type}". '
+                "A numba-dpex kernel must have a void return type."
+            )
+        else:
+            self.message = (
+                f'Kernel "{kernel_name}" has a return value '
+                f'of type "{return_type}". '
+                "A numba-dpex kernel must have a void return type."
+            )
 
         super().__init__(self.message)
 
@@ -321,3 +327,14 @@ def __init__(self) -> None:
             'The dpex compiler does not support the "force_pyobject" setting.'
         )
         super().__init__(self.message)
+
+
+class InvalidKernelSpecializationError(Exception):
+    def __init__(
+        self, kernel_name, invalid_sig, unsupported_argnum_list
+    ) -> None:
+        unsupported = ",".join([str(i) for i in unsupported_argnum_list])
+        self.message = f"Kernel {kernel_name} cannot be specialized for "
+        f'"{invalid_sig}". Arguments {unsupported} are not supported.'
+
+        super().__init__(self.message)
diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index 26a8c5866a..78cb2f31c6 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -9,8 +9,9 @@
 
 import dpctl
 import dpctl.program as dpctl_prog
-from numba.core import utils
-from numba.core.types import Array as ArrayType
+from numba.core import sigutils, types, utils
+from numba.core.types import Array as NpArrayType
+from numba.core.types import void
 
 from numba_dpex import config
 from numba_dpex.core.caching import LRUCache, NullCache, build_key
@@ -20,7 +21,8 @@
     ExecutionQueueInferenceError,
     IllegalRangeValueError,
     InvalidKernelLaunchArgsError,
-    SUAIProtocolError,
+    InvalidKernelSpecializationError,
+    KernelHasReturnValueError,
     UnknownGlobalRangeError,
     UnsupportedBackendError,
     UnsupportedNumberOfRangeDimsError,
@@ -46,9 +48,15 @@ def get_ordered_arg_access_types(pyfunc, access_types):
     return ordered_arg_access_types
 
 
-class Dispatcher(object):
-    """Creates a Kernel object from a @kernel decorated function and enqueues
-    the Kernel object on a specified device.
+class JitKernel:
+    """An abstract function object wrapping a concrete device kernel function.
+
+    A JitKernel is returned by the kernel decorator and wraps an instance of a
+    device kernel function. A device kernel function is specialized for a
+    backend may represent a binary object in a lower-level IR. Currently, only
+    SPIR-V binary format device functions for level-zero and opencl backends
+    are supported.
+
     """
 
     # The list of SYCL backends supported by the Dispatcher
@@ -60,6 +68,7 @@ def __init__(
         debug_flags=None,
         compile_flags=None,
         array_access_specifiers=None,
+        specialization_sigs=None,
         enable_cache=True,
     ):
         self.typingctx = dpex_target.typing_context
@@ -67,9 +76,11 @@ def __init__(
         self.debug_flags = debug_flags
         self.compile_flags = compile_flags
         self.kernel_name = pyfunc.__name__
+
         # TODO: To be removed once the__getitem__ is removed
         self._global_range = None
         self._local_range = None
+
         # caching related attributes
         if not config.ENABLE_CACHE:
             self._cache = NullCache()
@@ -99,6 +110,12 @@ def __init__(
         else:
             self._create_sycl_kernel_bundle_flags = []
 
+        # Specialization of kernel based on signatures. If specialization
+        # signatures are found, they are compiled ahead of time and cached.
+        if specialization_sigs:
+            for sig in specialization_sigs:
+                self._specialize(sig)
+
     @property
     def cache(self):
         return self._cache
@@ -107,14 +124,92 @@ def cache(self):
     def cache_hits(self):
         return self._cache_hits
 
-    def enable_caching(self):
-        if not config.ENABLE_CACHE:
-            self._cache = NullCache()
-        else:
-            self._cache = LRUCache(
-                capacity=config.CACHE_SIZE, pyfunc=self.pyfunc
+    def _compile_and_cache(self, argtypes, sig, backend, device_type):
+        kernel = SpirvKernel(self.pyfunc, self.kernel_name)
+        kernel.compile(
+            arg_types=argtypes,
+            debug=self.debug_flags,
+            extra_compile_flags=self.compile_flags,
+        )
+
+        device_driver_ir_module = kernel.device_driver_ir_module
+        kernel_module_name = kernel.module_name
+
+        key = build_key(
+            sig,
+            tuple(argtypes),
+            self.pyfunc,
+            kernel.target_context.codegen(),
+            backend=backend,
+            device_type=device_type,
+        )
+        self._cache.put(key, (device_driver_ir_module, kernel_module_name))
+
+        return device_driver_ir_module, kernel_module_name
+
+    def _specialize(self, sig):
+        """Compiles a device kernel ahead of time based on provided argtypes.
+
+        Args:
+            sig (_type_): _description_
+        """
+
+        argtypes, return_type = sigutils.normalize_signature(sig)
+
+        # Check if signature has a non-void return type
+        if return_type and return_type != void:
+            raise KernelHasReturnValueError(
+                kernel_name=None, return_type=return_type, sig=sig
             )
 
+        # USMNdarray check
+        usmarray_argnums = []
+        usmndarray_argtypes = []
+        unsupported_argnum_list = []
+
+        for i, argtype in enumerate(argtypes):
+            # FIXME: Add checks for other types of unsupported kernel args, e.g.
+            # complex.
+
+            # Check if a non-USMNdArray Array type is passed to the kernel
+            if isinstance(argtype, NpArrayType) and not isinstance(
+                argtype, USMNdArray
+            ):
+                unsupported_argnum_list.append(i)
+            elif isinstance(argtype, USMNdArray):
+                usmarray_argnums.append(i)
+                usmndarray_argtypes.append(argtype)
+
+        if unsupported_argnum_list:
+            raise InvalidKernelSpecializationError(
+                kernel_name=self.kernel_name,
+                invalid_sig=sig,
+                unsupported_argnum_list=unsupported_argnum_list,
+            )
+
+        # CFD check and get the execution queue
+        device = self._chk_compute_follows_data_compliance(usmndarray_argtypes)
+        if not device:
+            raise ComputeFollowsDataInferenceError(
+                self.kernel_name, usmarray_argnum_list=usmarray_argnums
+            )
+
+        if device.backend not in [
+            dpctl.backend_type.opencl,
+            dpctl.backend_type.level_zero,
+        ]:
+            raise UnsupportedBackendError(
+                self.kernel_name, device.backend, JitKernel._supported_backends
+            )
+
+        # compile and cache the kernel
+        self._compile_and_cache(
+            argtypes=argtypes,
+            sig=sig,
+            backend=device.backend,
+            device_type=device.device_type,
+        )
+
     def _check_range(self, range, device):
 
         if not isinstance(range, (tuple, list)):
@@ -140,30 +235,42 @@ def _check_ndrange(self, global_range, local_range, device):
         #         )
         pass
 
-    def _determine_compute_follows_data_queue(self, usm_array_list):
-        """Determine the execution queue for the list of usm array args using
-        compute follows data programming model.
+    def _chk_compute_follows_data_compliance(self, usm_array_arglist):
+        """Check if all the usm ndarray's have the same device.
+
+        Extracts the device filter string from the Numba inferred USMNdArray
+        type. Check if the devices corresponding to the filter string are
+        equivalent and return a ``dpctl.SyclDevice`` object corresponding to the
+        common filter string.
 
-        Uses ``dpctl.utils.get_execution_queue()`` to check if the list of
-        queues belonging to the usm_ndarrays are equivalent. If the queues are
-        equivalent, then returns the queue. If the queues are not equivalent
-        then returns None.
+        If an exception occurred in creating a ``dpctl.SyclDevice``, or the
+        devices are not equivalent then returns None.
 
         Args:
-            usm_array_list : A list of usm_ndarray objects
+            usm_array_arglist : A list of usm_ndarray types specified as
+            arguments to the kernel.
 
         Returns:
-            A queue the common queue used to allocate the arrays. If no such
-            queue exists, then returns None.
+            A ``dpctl.SyclDevice`` object if all USMNdArray have same device, or
+            else None is returned.
         """
-        queues = []
-        for usm_array in usm_array_list:
+
+        device = None
+
+        for usm_array in usm_array_arglist:
+            filter_str = usm_array.device
             try:
-                q = usm_array.__sycl_usm_array_interface__["syclobj"]
-                queues.append(q)
-            except:
-                raise SUAIProtocolError(self.kernel_name, usm_array)
-        return dpctl.utils.get_execution_queue(queues)
+                _device = dpctl.SyclDevice(filter_str)
+            except Exception as e:
+                print(e)
+                return None
+            if not device:
+                device = _device
+            else:
+                if _device != device:
+                    return None
+
+        return device
 
     def _determine_kernel_launch_queue(self, args, argtypes):
         """Determines the queue where the kernel is to be launched.
@@ -223,17 +330,18 @@ def _determine_kernel_launch_queue(self, args, argtypes):
             ExecutionQueueInferenceError: If the queue could not be inferred
                 using the dpctl queue manager.
         """
-        # Temporary workaround as USMNdArray derives from Array
+
+        # FIXME: The args parameter is not needed once numpy support is removed
+
+        # Needed as USMNdArray derives from Array
         array_argnums = [
             i
-            for i, arg in enumerate(args)
-            if isinstance(argtypes[i], ArrayType)
+            for i, _ in enumerate(args)
+            if isinstance(argtypes[i], NpArrayType)
             and not isinstance(argtypes[i], USMNdArray)
         ]
         usmarray_argnums = [
-            i
-            for i, arg in enumerate(args)
-            if isinstance(argtypes[i], USMNdArray)
+            i for i, _ in enumerate(args) if isinstance(argtypes[i], USMNdArray)
         ]
 
         # if usm and non-usm array arguments are getting mixed, then the
@@ -269,15 +377,19 @@ def _determine_kernel_launch_queue(self, args, argtypes):
                     + "are dpctl.tensor.usm_ndarray based array containers."
                 )
             usm_array_args = [
-                arg for i, arg in enumerate(args) if i in usmarray_argnums
+                argtype
+                for i, argtype in enumerate(argtypes)
+                if i in usmarray_argnums
             ]
-            queue = self._determine_compute_follows_data_queue(usm_array_args)
-            if not queue:
+
+            device = self._chk_compute_follows_data_compliance(usm_array_args)
+
+            if not device:
                 raise ComputeFollowsDataInferenceError(
                     self.kernel_name, usmarray_argnum_list=usmarray_argnums
                 )
             else:
-                return queue
+                return dpctl.SyclQueue(device)
         else:
             if dpctl.is_in_device_context():
                 warn(
@@ -409,8 +521,8 @@ def _get_ranges(self, global_range, local_range, device):
         # invoked using a SYCL nd_range
         if global_range and not local_range:
             self._check_range(global_range, device)
-            # FIXME:[::-1] is done as OpenCL and SYCl have different orders when it
-            # comes to specifying dimensions.
+            # FIXME:[::-1] is done as OpenCL and SYCl have different orders when
+            # it comes to specifying dimensions.
             global_range = list(global_range)[::-1]
         else:
             if isinstance(local_range, int):
@@ -433,7 +545,9 @@ def __call__(self, *args, global_range=None, local_range=None):
             local_range (_type_): _description_.
         """
         argtypes = [self.typingctx.resolve_argument_type(arg) for arg in args]
-
+        # FIXME: For specialized and ahead of time compiled and cached kernels,
+        # the CFD check was already done statically. The run-time check is
+        # redundant. We should avoid these checks for the specialized case.
         exec_queue = self._determine_kernel_launch_queue(args, argtypes)
         backend = exec_queue.backend
         device_type = exec_queue.sycl_device.device_type
@@ -443,7 +557,7 @@ def __call__(self, *args, global_range=None, local_range=None):
             dpctl.backend_type.level_zero,
         ]:
             raise UnsupportedBackendError(
-                self.kernel_name, backend, Dispatcher._supported_backends
+                self.kernel_name, backend, JitKernel._supported_backends
             )
 
         # TODO: Refactor after __getitem__ is removed
@@ -451,9 +565,6 @@ def __call__(self, *args, global_range=None, local_range=None):
             global_range, local_range, exec_queue.sycl_device
         )
 
-        # TODO: Enable caching of kernels, but do it using LRU
-        # caching and numba's pickle framework.
-
         # load the kernel from cache
         sig = utils.pysignature(self.pyfunc)
         key = build_key(
@@ -470,25 +581,15 @@ def __call__(self, *args, global_range=None, local_range=None):
             device_driver_ir_module, kernel_module_name = artifact
             self._cache_hits += 1
         else:
-            kernel = SpirvKernel(self.pyfunc, self.kernel_name)
-            kernel.compile(
-                arg_types=argtypes,
-                debug=self.debug_flags,
-                extra_compile_flags=self.compile_flags,
-            )
-
-            device_driver_ir_module = kernel.device_driver_ir_module
-            kernel_module_name = kernel.module_name
-
-            key = build_key(
-                sig,
-                tuple(argtypes),
-                self.pyfunc,
-                kernel.target_context.codegen(),
+            (
+                device_driver_ir_module,
+                kernel_module_name,
+            ) = self._compile_and_cache(
+                argtypes=argtypes,
+                sig=sig,
                 backend=backend,
                 device_type=device_type,
             )
-            self._cache.put(key, (device_driver_ir_module, kernel_module_name))
 
         # create a sycl::KernelBundle
         kernel_bundle = dpctl_prog.create_program_from_spirv(
@@ -497,7 +598,7 @@ def __call__(self, *args, global_range=None, local_range=None):
             " ".join(self._create_sycl_kernel_bundle_flags),
         )
         #  get the sycl::kernel
-        kernel = kernel_bundle.get_sycl_kernel(kernel_module_name)
+        sycl_kernel = kernel_bundle.get_sycl_kernel(kernel_module_name)
 
         packer = Packer(
             kernel_name=self.kernel_name,
@@ -508,7 +609,7 @@ def __call__(self, *args, global_range=None, local_range=None):
         )
 
         exec_queue.submit(
-            kernel,
+            sycl_kernel,
             packer.unpacked_args,
             global_range,
             local_range,
diff --git a/numba_dpex/decorators.py b/numba_dpex/decorators.py
index efb05b1c2e..9eff678108 100644
--- a/numba_dpex/decorators.py
+++ b/numba_dpex/decorators.py
@@ -2,12 +2,12 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import dpctl
+from warnings import warn
+
 from numba.core import sigutils, types
 
-from numba_dpex.compiler import JitKernel
 from numba_dpex.core.kernel_interface.dispatcher import (
-    Dispatcher,
+    JitKernel,
     get_ordered_arg_access_types,
 )
 from numba_dpex.core.kernel_interface.func import (
@@ -17,71 +17,59 @@
 from numba_dpex.utils import npytypes_array_to_dpex_array
 
 
-def kernel(func_or_sig=None, access_types=None, debug=None, enable_cache=True):
-    """The decorator to write a numba_dpex kernel function.
+def kernel(
+    func_or_sig=None,
+    access_types=None,
+    debug=None,
+    enable_cache=True,
+):
+    """A decorator to define a kernel function.
 
     A kernel function is conceptually equivalent to a SYCL kernel function, and
     gets compiled into either an OpenCL or a LevelZero SPIR-V binary kernel.
-    A dpex kernel imposes the following restrictions:
+    A kernel decorated Python function has the following restrictions:
 
-        * A numba_dpex.kernel function can not return any value.
-        * All array arguments passed to a kernel should be of the same type
-          and have the same dtype.
+        * The function can not return any value.
+        * All array arguments passed to a kernel should adhere to compute
+          follows data programming model.
     """
-    if func_or_sig is None:
-        return autojit(
-            debug=debug, access_types=access_types, enable_cache=enable_cache
-        )
-    elif not sigutils.is_signature(func_or_sig):
-        func = func_or_sig
-        return autojit(
-            debug=debug, access_types=access_types, enable_cache=enable_cache
-        )(func)
-    else:
-        return _kernel_jit(
-            func_or_sig, debug, access_types, enable_cache=enable_cache
-        )
-
 
-def autojit(debug=None, access_types=None, enable_cache=True):
-    def _kernel_dispatcher(pyfunc):
+    def _kernel_dispatcher(pyfunc, sigs=None):
         ordered_arg_access_types = get_ordered_arg_access_types(
             pyfunc, access_types
         )
-        return Dispatcher(
+        return JitKernel(
             pyfunc=pyfunc,
             debug_flags=debug,
             array_access_specifiers=ordered_arg_access_types,
             enable_cache=enable_cache,
+            specialization_sigs=sigs,
         )
 
-    return _kernel_dispatcher
-
-
-def _kernel_jit(signature, debug, access_types, enable_cache=True):
-    argtypes, rettype = sigutils.normalize_signature(signature)
-    argtypes = tuple(
-        [
-            npytypes_array_to_dpex_array(ty)
-            if isinstance(ty, types.npytypes.Array)
-            else ty
-            for ty in argtypes
-        ]
-    )
-
-    def _wrapped(pyfunc):
-        current_queue = dpctl.get_current_queue()
-        ordered_arg_access_types = get_ordered_arg_access_types(
-            pyfunc, access_types
-        )
-        # We create an instance of JitKernel to make sure at call time
-        # we are going through the caching mechanism.
-        kernel = JitKernel(pyfunc, debug, ordered_arg_access_types)
-        # This will make sure we are compiling eagerly.
-        kernel.specialize(argtypes, current_queue)
-        return kernel
-
-    return _wrapped
+    if func_or_sig is None:
+        return _kernel_dispatcher
+    elif not sigutils.is_signature(func_or_sig):
+        func = func_or_sig
+        return _kernel_dispatcher(func)
+    else:
+        # Specialized signatures can either be a single signature or a list.
+        # In case only one signature is provided convert it to a list
+        if not isinstance(func_or_sig, list):
+            func_or_sig = [func_or_sig]
+
+        def _specialized_kernel_dispatcher(pyfunc):
+            ordered_arg_access_types = get_ordered_arg_access_types(
+                pyfunc, access_types
+            )
+            return JitKernel(
+                pyfunc=pyfunc,
+                debug_flags=debug,
+                array_access_specifiers=ordered_arg_access_types,
+                enable_cache=enable_cache,
+                specialization_sigs=func_or_sig,
+            )
+
+        return _specialized_kernel_dispatcher
 
 
 def func(signature=None, debug=None):
diff --git a/numba_dpex/device_init.py b/numba_dpex/device_init.py
index 3452ffdcb9..ad1ed3457a 100644
--- a/numba_dpex/device_init.py
+++ b/numba_dpex/device_init.py
@@ -33,6 +33,6 @@
 
 from . import initialize
 from .core import target
-from .decorators import autojit, func, kernel
+from .decorators import func, kernel
 
 initialize.load_dpctl_sycl_interface()
diff --git a/numba_dpex/tests/kernel_tests/test_caching.py b/numba_dpex/tests/kernel_tests/test_caching.py
index 25385fa3a8..58c7fa1ed4 100644
--- a/numba_dpex/tests/kernel_tests/test_caching.py
+++ b/numba_dpex/tests/kernel_tests/test_caching.py
@@ -9,7 +9,7 @@
 
 import numba_dpex as dpex
 from numba_dpex.core.kernel_interface.dispatcher import (
-    Dispatcher,
+    JitKernel,
     get_ordered_arg_access_types,
 )
 from numba_dpex.tests._helper import filter_strings
@@ -41,7 +41,7 @@ def data_parallel_sum(x, y, z):
 
     expected = dpt.asnumpy(a) + dpt.asnumpy(b)
 
-    d = Dispatcher(
+    d = JitKernel(
         data_parallel_sum,
         array_access_specifiers=get_ordered_arg_access_types(
             data_parallel_sum, None

From c70795be004bbeaf5e7557ea8f6637ccac6c35bf Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Thu, 5 Jan 2023 01:28:08 -0600
Subject: [PATCH 25/51] Update driver.py with specialization.

---
 driver.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/driver.py b/driver.py
index 9f2b32ab60..002942dc1b 100644
--- a/driver.py
+++ b/driver.py
@@ -6,10 +6,13 @@
 import dpctl.tensor as dpt
 
 import numba_dpex as dpex
-from numba_dpex.core.kernel_interface.dispatcher import Dispatcher
+from numba_dpex import usm_ndarray
+from numba_dpex.core.kernel_interface.dispatcher import JitKernel
 
+arrty = usm_ndarray(int, 1, "C", "device", "gpu")
 
-@dpex.kernel
+
+@dpex.kernel((arrty, arrty, arrty))
 def data_parallel_sum(a, b, c):
     """
     Vector addition using the ``kernel`` decorator.

From b19d71558c442e271e5d34c17729be558337825b Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Thu, 5 Jan 2023 20:18:22 -0600
Subject: [PATCH 26/51] Fix driver

---
 driver.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/driver.py b/driver.py
index 002942dc1b..d31024fdaa 100644
--- a/driver.py
+++ b/driver.py
@@ -6,10 +6,10 @@
 import dpctl.tensor as dpt
 
 import numba_dpex as dpex
-from numba_dpex import usm_ndarray
+from numba_dpex import int64, usm_ndarray
 from numba_dpex.core.kernel_interface.dispatcher import JitKernel
 
-arrty = usm_ndarray(int, 1, "C", "device", "gpu")
+arrty = usm_ndarray(int64, 1, "C", "device", "level_zero:gpu:0")
 
 
 @dpex.kernel((arrty, arrty, arrty))

From 63490a829dcef2c7d9f9fd87642d72fef9a1186e Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Thu, 5 Jan 2023 21:56:19 -0600
Subject: [PATCH 27/51] Fix formatting issues.

---
 numba_dpex/core/caching.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/numba_dpex/core/caching.py b/numba_dpex/core/caching.py
index 1810c52b5d..88b71ebaaa 100644
--- a/numba_dpex/core/caching.py
+++ b/numba_dpex/core/caching.py
@@ -95,16 +95,18 @@ def check_cachable(self, cres):
         """Check if a certain object is cacheable.
 
         Args:
-            cres (object): The object to be cached. For example, if the object is
-            `CompileResult`, then you might want to follow the similar checks as
-            has been done in `numba.core.caching.CompileResultCacheImpl`.
+            cres (object): The object to be cached. For example, if the object
+            is `CompileResult`, then you might want to follow the similar
+            checks as has been done in
+            `numba.core.caching.CompileResultCacheImpl`.
 
         Returns:
             bool: Return `True` if cacheable, otherwise `False`.
         """
         # TODO: Although, for the time being, assuming all Kernels in numba_dpex
         # are always cachable. However, we might need to add some bells and
-        # whistles in the future. Look at numba.core.caching for how to implement.
+        # whistles in the future. Look at numba.core.caching for how to
+        # implement.
         return True
 
 
@@ -167,7 +169,8 @@ def put(self, key, value):
 
         Args:
             key (object): The key to the data (i.e. compiled kernel/function).
-            value (object): The data to be cached (i.e. compiled kernel/function).
+            value (object): The data to be cached (i.e.
+            compiled kernel/function).
         """
         pass
 
@@ -298,8 +301,8 @@ def _get_memsize(self, obj, seen=None):
         obj_id = id(obj)
         if obj_id in seen:
             return 0
-        # Important mark as seen *before* entering recursion to gracefully handle
-        # self-referential objects
+        # Important mark as seen *before* entering recursion to gracefully
+        # handle self-referential objects
         seen.add(obj_id)
         if isinstance(obj, dict):
             size += sum([self._get_memsize(v, seen) for v in obj.values()])
@@ -420,9 +423,8 @@ def get(self, key):
                 value = self._cache_file.load(key)
                 if config.DEBUG_CACHE:
                     print(
-                        "[cache]: unpickled an evicted artifact, key: {0:s}.".format(
-                            str(key)
-                        )
+                        "[cache]: unpickled an evicted artifact, "
+                        "key: {0:s}.".format(str(key))
                     )
             else:
                 value = self._evicted[key]
@@ -469,7 +471,8 @@ def put(self, key, value):
             if self._cache_file:
                 if config.DEBUG_CACHE:
                     print(
-                        "[cache] size: {0:d}, pickling the LRU item, key: {1:s}, indexed at {2:s}.".format(
+                        "[cache] size: {0:d}, pickling the LRU item, "
+                        "key: {1:s}, indexed at {2:s}.".format(
                             len(self._lookup),
                             str(self._head.key),
                             self._cache_file._index_path,

From 4d0dd198e298b579d7c24fb0808efa3810c1f71b Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Fri, 6 Jan 2023 01:39:20 -0600
Subject: [PATCH 28/51] Improvements to USMNdArrayType.

   - print a better type name with keywords and all attributes.
   - store a fully qualified filter string in the type.
   - set the default address space to GLOBAL instead of None.
   - fix the unify function to test on usm_type, device, address
     space.
---
 numba_dpex/core/types/usm_ndarray_type.py | 45 +++++++++++++++++++----
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/numba_dpex/core/types/usm_ndarray_type.py b/numba_dpex/core/types/usm_ndarray_type.py
index e156d7a5c7..0e4ba2a851 100644
--- a/numba_dpex/core/types/usm_ndarray_type.py
+++ b/numba_dpex/core/types/usm_ndarray_type.py
@@ -2,12 +2,15 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+"""A type class to represent dpctl.tensor.usm_ndarray type in Numba
+"""
+
+import dpctl
 import dpctl.tensor
 from numba.core.typeconv import Conversion
 from numba.core.types.npytypes import Array
 
-"""A type class to represent dpctl.tensor.usm_ndarray type in Numba
-"""
+from numba_dpex.utils import address_space
 
 
 class USMNdArray(Array):
@@ -23,20 +26,35 @@ def __init__(
         readonly=False,
         name=None,
         aligned=True,
-        addrspace=None,
+        addrspace=address_space.GLOBAL,
     ):
         self.usm_type = usm_type
-        self.device = device
         self.addrspace = addrspace
 
+        # Normalize the device filter string and get the fully qualified three
+        # tuple (backend:device_type:device_num) filter string from dpctl.
+        _d = dpctl.SyclDevice(device)
+        self.device = _d.filter_string
+
         if name is None:
             type_name = "usm_ndarray"
             if readonly:
                 type_name = "readonly " + type_name
             if not aligned:
                 type_name = "unaligned " + type_name
-            name_parts = (type_name, dtype, ndim, layout, usm_type, device)
-            name = "%s(%s, %sd, %s, %s, %s)" % name_parts
+            name_parts = (
+                type_name,
+                dtype,
+                ndim,
+                layout,
+                self.addrspace,
+                usm_type,
+                self.device,
+            )
+            name = (
+                "%s(dtype=%s, ndim=%s, layout=%s, address_space=%s, "
+                "usm_type=%s, sycl_device=%s)" % name_parts
+            )
 
         super().__init__(
             dtype,
@@ -86,8 +104,16 @@ def unify(self, typingctx, other):
         """
         Unify this with the *other* USMNdArray.
         """
-        # If other is array and the ndim matches
-        if isinstance(other, USMNdArray) and other.ndim == self.ndim:
+        # If other is array and the ndim, usm_type, address_space, and device
+        # attributes match
+
+        if (
+            isinstance(other, USMNdArray)
+            and other.ndim == self.ndim
+            and self.device == other.device
+            and self.addrspace == other.addrspace
+            and self.usm_type == other.usm_type
+        ):
             # If dtype matches or other.dtype is undefined (inferred)
             if other.dtype == self.dtype or not other.dtype.is_precise():
                 if self.layout == other.layout:
@@ -102,6 +128,9 @@ def unify(self, typingctx, other):
                     layout=layout,
                     readonly=readonly,
                     aligned=aligned,
+                    usm_type=self.usm_type,
+                    device=self.device,
+                    addrspace=self.addrspace,
                 )
 
     def can_convert_to(self, typingctx, other):

From 1b6b45236acd36f8c100384b5feee1245f8f775f Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Fri, 6 Jan 2023 01:42:47 -0600
Subject: [PATCH 29/51] Remove the function signature from the cache composite
 key.

---
 numba_dpex/core/caching.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/numba_dpex/core/caching.py b/numba_dpex/core/caching.py
index 88b71ebaaa..3d011787c3 100644
--- a/numba_dpex/core/caching.py
+++ b/numba_dpex/core/caching.py
@@ -12,18 +12,19 @@
 from numba_dpex import config
 
 
-def build_key(sig, argtypes, pyfunc, codegen, backend=None, device_type=None):
-    """Constructs a key from python function, context, backend
-        and the device type.
+def build_key(argtypes, pyfunc, codegen, backend=None, device_type=None):
+    """Constructs a key from python function, context, backend and the device
+    type.
 
-    Compute index key for the given signature and codegen.
-    It includes a description of the OS, target architecture
-    and hashes of the bytecode for the function and, if the
-    function has a __closure__, a hash of the cell_contents.
+    Compute index key for the given argument types and codegen. It includes a
+    description of the OS, target architecture and hashes of the bytecode for
+    the function and, if the function has a __closure__, a hash of the
+    cell_contents.type
 
     Args:
-        sig (inspect.Signature): The signature object of
-            a python function.
+        argtypes : A tuple of numba types corresponding to the arguments to the
+        compiled function.
+        pyfunc : The Python function that is to be compiled and cached.
         codegen (numba.core.codegen.Codegen):
             The codegen object found from the target context.
         backend (enum, optional): A 'backend_type' enum.
@@ -32,9 +33,8 @@ def build_key(sig, argtypes, pyfunc, codegen, backend=None, device_type=None):
             Defaults to None.
 
     Returns:
-        tuple: A tuple of signature, magic_tuple of codegen
-            and another tuple of hashcodes from bytecode and
-            cell_contents.
+        tuple: A tuple of return type, argtpes, magic_tuple of codegen
+            and another tuple of hashcodes from bytecode and cell_contents.
     """
 
     codebytes = pyfunc.__code__.co_code
@@ -50,7 +50,6 @@ def build_key(sig, argtypes, pyfunc, codegen, backend=None, device_type=None):
         cvarbytes = b""
 
     return (
-        sig,
         argtypes,
         codegen.magic_tuple(),
         backend,

From 3e14764b2c06f12d2dd1fe9d9fabd05b28da2667 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Fri, 6 Jan 2023 01:50:50 -0600
Subject: [PATCH 30/51] Add unit tests for kernel specialization.

   - Added unit tests for the kernel specialization feature.
   - Fix kernel decorator to support list of signatures.
   - Disallow JIT compilation of a specialized kernel and raise
     an exception.
---
 numba_dpex/core/exceptions.py                 | 48 +++++++++--
 .../core/kernel_interface/dispatcher.py       | 61 +++++++++-----
 numba_dpex/decorators.py                      |  8 +-
 .../test_kernel_specialization.py             | 81 +++++++++++++++++++
 4 files changed, 168 insertions(+), 30 deletions(-)
 create mode 100644 numba_dpex/tests/kernel_tests/test_kernel_specialization.py

diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py
index 99791bb931..003dcc5792 100644
--- a/numba_dpex/core/exceptions.py
+++ b/numba_dpex/core/exceptions.py
@@ -314,9 +314,11 @@ class UnsupportedAccessQualifierError(Exception):
     def __init__(
         self, kernel_name, array_val, illegal_access_type, legal_access_list
     ) -> None:
-        self.message = f"Invalid access type {illegal_access_type} applied to "
-        f'array {array_val} argument passed to kernel "{kernel_name}". '
-        f"Legal access specifiers are {legal_access_list}."
+        self.message = (
+            f"Invalid access type {illegal_access_type} applied to "
+            f'array {array_val} argument passed to kernel "{kernel_name}". '
+            f"Legal access specifiers are {legal_access_list}."
+        )
 
         super().__init__(self.message)
 
@@ -330,11 +332,47 @@ def __init__(self) -> None:
 
 
 class InvalidKernelSpecializationError(Exception):
+    """Exception raised when a the specialization argument types are not
+    supported by the dpex kernel decorator.
+
+    The exception is raised whenever an unsupported kernel argument is
+    provided in the specialization signature passed to a dpex kernel decorator
+    instance. For example, dpex kernels require arrays to be of USMNdArray type
+    and no other Array type, such as NumPy ndarray, are supported. If the
+    signature has an non USMNdArray Array type the exception is raised.
+
+    Args:
+        kernel_name (str): Name of kernel where the error was raised.
+        invalid_sig: Unsupported signature.
+        unsupported_argnum_list : The list of argument numbers that are
+        unsupported.
+    """
+
     def __init__(
         self, kernel_name, invalid_sig, unsupported_argnum_list
     ) -> None:
         unsupported = ",".join([str(i) for i in unsupported_argnum_list])
-        self.message = f"Kernel {kernel_name} cannot be specialized for "
-        f'"{invalid_sig}". Arguments {unsupported} are not supported.'
+        self.message = (
+            f"Kernel {kernel_name} cannot be specialized for "
+            f'"{invalid_sig}". Arguments {unsupported} are not supported.'
+        )
+
+        super().__init__(self.message)
+
+
+class MissingSpecializationError(Exception):
+    """Exception raised when a specialized JitKernel was called with arguments
+     that do not match any of the specialized versions of the JitKernel.
+
+    Args:
+        kernel_name (str): Name of kernel where the error was raised.
+        sig: Unsupported argument types used to call a specialized JitKernel.
+    """
+
+    def __init__(self, kernel_name, argtypes) -> None:
+        self.message = (
+            f"No specialized version of the kernel {kernel_name} "
+            f"exists for argument types: {argtypes}."
+        )
 
         super().__init__(self.message)
diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index 78cb2f31c6..4c03f3b91c 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -23,6 +23,7 @@
     InvalidKernelLaunchArgsError,
     InvalidKernelSpecializationError,
     KernelHasReturnValueError,
+    MissingSpecializationError,
     UnknownGlobalRangeError,
     UnsupportedBackendError,
     UnsupportedNumberOfRangeDimsError,
@@ -113,8 +114,20 @@ def __init__(
         # Specialization of kernel based on signatures. If specialization
         # signatures are found, they are compiled ahead of time and cached.
         if specialization_sigs:
+            self._has_specializations = True
+            self._specialization_cache = LRUCache(
+                capacity=config.CACHE_SIZE, pyfunc=self.pyfunc
+            )
             for sig in specialization_sigs:
                 self._specialize(sig)
+            if self._specialization_cache.size() == 0:
+                raise AssertionError(
+                    "JitKernel could not be specialized for signatures: "
+                    + specialization_sigs
+                )
+        else:
+            self._has_specializations = False
+            self._specialization_cache = NullCache()
 
     @property
     def cache(self):
@@ -124,7 +137,7 @@ def cache(self):
     def cache_hits(self):
         return self._cache_hits
 
-    def _compile_and_cache(self, argtypes, sig, backend, device_type):
+    def _compile_and_cache(self, argtypes, backend, device_type, cache):
         kernel = SpirvKernel(self.pyfunc, self.kernel_name)
         kernel.compile(
             arg_types=argtypes,
@@ -136,14 +149,13 @@ def _compile_and_cache(self, argtypes, sig, backend, device_type):
         kernel_module_name = kernel.module_name
 
         key = build_key(
-            sig,
             tuple(argtypes),
             self.pyfunc,
             kernel.target_context.codegen(),
             backend=backend,
             device_type=device_type,
         )
-        self._cache.put(key, (device_driver_ir_module, kernel_module_name))
+        cache.put(key, (device_driver_ir_module, kernel_module_name))
 
         return device_driver_ir_module, kernel_module_name
 
@@ -201,13 +213,12 @@ def _specialize(self, sig):
             raise UnsupportedBackendError(
                 self.kernel_name, device.backend, JitKernel._supported_backends
             )
-
         # compile and cache the kernel
         self._compile_and_cache(
             argtypes=argtypes,
-            sig=sig,
             backend=device.backend,
             device_type=device.device_type,
+            cache=self._specialization_cache,
         )
 
     def _check_range(self, range, device):
@@ -566,30 +577,38 @@ def __call__(self, *args, global_range=None, local_range=None):
         )
 
         # load the kernel from cache
-        sig = utils.pysignature(self.pyfunc)
         key = build_key(
-            sig,
             tuple(argtypes),
             self.pyfunc,
             dpex_target.target_context.codegen(),
             backend=backend,
             device_type=device_type,
         )
-        artifact = self._cache.get(key)
-        # if it's not cached, i.e. first time
-        if artifact is not None:
-            device_driver_ir_module, kernel_module_name = artifact
-            self._cache_hits += 1
+
+        # If the JitKernel was specialized then raise exception if argtypes
+        # do not match one of the specialized versions.
+        if self._has_specializations:
+            artifact = self._specialization_cache.get(key)
+            if artifact is not None:
+                device_driver_ir_module, kernel_module_name = artifact
+            else:
+                raise MissingSpecializationError(self.kernel_name, argtypes)
         else:
-            (
-                device_driver_ir_module,
-                kernel_module_name,
-            ) = self._compile_and_cache(
-                argtypes=argtypes,
-                sig=sig,
-                backend=backend,
-                device_type=device_type,
-            )
+            artifact = self._cache.get(key)
+            # if the kernel was not previously cached, compile it.
+            if artifact is not None:
+                device_driver_ir_module, kernel_module_name = artifact
+                self._cache_hits += 1
+            else:
+                (
+                    device_driver_ir_module,
+                    kernel_module_name,
+                ) = self._compile_and_cache(
+                    argtypes=argtypes,
+                    backend=backend,
+                    device_type=device_type,
+                    cache=self._cache,
+                )
 
         # create a sycl::KernelBundle
         kernel_bundle = dpctl_prog.create_program_from_spirv(
diff --git a/numba_dpex/decorators.py b/numba_dpex/decorators.py
index 9eff678108..22db0a88cc 100644
--- a/numba_dpex/decorators.py
+++ b/numba_dpex/decorators.py
@@ -48,10 +48,7 @@ def _kernel_dispatcher(pyfunc, sigs=None):
 
     if func_or_sig is None:
         return _kernel_dispatcher
-    elif not sigutils.is_signature(func_or_sig):
-        func = func_or_sig
-        return _kernel_dispatcher(func)
-    else:
+    elif isinstance(func_or_sig, list) or sigutils.is_signature(func_or_sig):
         # Specialized signatures can either be a single signature or a list.
         # In case only one signature is provided convert it to a list
         if not isinstance(func_or_sig, list):
@@ -70,6 +67,9 @@ def _specialized_kernel_dispatcher(pyfunc):
             )
 
         return _specialized_kernel_dispatcher
+    else:
+        func = func_or_sig
+        return _kernel_dispatcher(func)
 
 
 def func(signature=None, debug=None):
diff --git a/numba_dpex/tests/kernel_tests/test_kernel_specialization.py b/numba_dpex/tests/kernel_tests/test_kernel_specialization.py
new file mode 100644
index 0000000000..420612a896
--- /dev/null
+++ b/numba_dpex/tests/kernel_tests/test_kernel_specialization.py
@@ -0,0 +1,81 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import dpctl.tensor as dpt
+import pytest
+
+import numba_dpex as dpex
+from numba_dpex import float32, int64, usm_ndarray
+from numba_dpex.core.exceptions import (
+    InvalidKernelSpecializationError,
+    MissingSpecializationError,
+)
+
+i64arrty = usm_ndarray(int64, 1, "C", usm_type="device", device="0")
+f32arrty = usm_ndarray(float32, 1, "C", usm_type="device", device="0")
+
+specialized_kernel1 = dpex.kernel((i64arrty, i64arrty, i64arrty))
+specialized_kernel2 = dpex.kernel(
+    [(i64arrty, i64arrty, i64arrty), (f32arrty, f32arrty, f32arrty)]
+)
+
+
+def data_parallel_sum(a, b, c):
+    """
+    Vector addition using the ``kernel`` decorator.
+    """
+    i = dpex.get_global_id(0)
+    c[i] = a[i] + b[i]
+
+
+def test_single_specialization():
+    """Test if a kernel can be specialized with a single signature."""
+    jitkernel = specialized_kernel1(data_parallel_sum)
+    assert jitkernel._specialization_cache.size() == 1
+
+
+def test_multiple_specialization():
+    """Test if a kernel can be specialized with multiple signatures."""
+    jitkernel = specialized_kernel2(data_parallel_sum)
+    assert jitkernel._specialization_cache.size() == 2
+
+
+def test_invalid_specialization_error():
+    """Test if an InvalidKernelSpecializationError is raised when attempting to
+    specialize with NumPy arrays.
+    """
+    specialized_kernel3 = dpex.kernel((int64[::1], int64[::1], int64[::1]))
+    with pytest.raises(InvalidKernelSpecializationError):
+        specialized_kernel3(data_parallel_sum)
+
+
+def test_missing_specialization_error():
+    """Test if a MissingSpecializationError is raised when calling a
+    specialized kernel with unsupported arguments.
+    """
+    a = dpt.ones(1024, dtype=dpt.int32)
+    b = dpt.ones(1024, dtype=dpt.int32)
+    c = dpt.zeros(1024, dtype=dpt.int32)
+
+    with pytest.raises(MissingSpecializationError):
+        specialized_kernel1(data_parallel_sum)[
+            1024,
+        ](a, b, c)
+
+
+def test_execution_of_specialized_kernel():
+    """Test if the specialized kernel is correctly executed."""
+    a = dpt.ones(1024, dtype=dpt.int64, device="0")
+    b = dpt.ones(1024, dtype=dpt.int64, device="0")
+    c = dpt.zeros(1024, dtype=dpt.int64, device="0")
+
+    specialized_kernel1(data_parallel_sum)[
+        1024,
+    ](a, b, c)
+
+    npc = dpt.asnumpy(c)
+    import numpy as np
+
+    npc_expected = np.full(1024, 2, dtype=np.int64)
+    assert np.array_equal(npc, npc_expected)

From 539adffd75251b52f972c59d53dcb5565122f4b9 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Fri, 6 Jan 2023 12:00:42 -0600
Subject: [PATCH 31/51] Mark all vectorize tests as xfail.

---
 numba_dpex/tests/test_vectorize.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/numba_dpex/tests/test_vectorize.py b/numba_dpex/tests/test_vectorize.py
index af00468cf7..dfd9a692d7 100644
--- a/numba_dpex/tests/test_vectorize.py
+++ b/numba_dpex/tests/test_vectorize.py
@@ -23,6 +23,7 @@ def shape(request):
     return request.param
 
 
+@pytest.mark.xfail
 @pytest.mark.parametrize("filter_str", filter_strings)
 def test_njit(filter_str):
     @vectorize(nopython=True)
@@ -66,6 +67,7 @@ def input_type(request):
     return request.param
 
 
+@pytest.mark.xfail
 @pytest.mark.parametrize("filter_str", filter_strings)
 def test_vectorize(filter_str, shape, dtypes, input_type):
     def vector_add(a, b):

From 7d35b62626242603a85db85daf659f19176c4e9b Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Sat, 7 Jan 2023 03:06:32 -0600
Subject: [PATCH 32/51] Improve support for SUAI arrays as kernel arguments.

    - Move sycl_usm_array_interface helper functions into core/utils.
    - Move array type conversion function for unknown type with
      sycl_usm_array_interface to UsmNdArray into core/typeconv.
    - Fix how the UsmNdArray type is constructed for unknow
      sycl_usm_array_interface arrays.
---
 numba_dpex/compiler.py                        |   2 +-
 numba_dpex/core/exceptions.py                 |   6 +-
 .../kernel_interface/arg_pack_unpacker.py     |  90 +++--------
 numba_dpex/core/target.py                     |  29 ++--
 numba_dpex/core/typeconv/__init__.py          |   7 +
 numba_dpex/core/typeconv/array_conversion.py  |  45 ++++++
 numba_dpex/core/utils/__init__.py             |  10 ++
 numba_dpex/core/utils/suai_helper.py          | 153 ++++++++++++++++++
 .../test_sycl_usm_array_iface_interop.py      |   2 +-
 numba_dpex/utils/__init__.py                  |   8 +-
 numba_dpex/utils/array_utils.py               |  34 ----
 numba_dpex/utils/type_conversion_fns.py       |  48 +-----
 12 files changed, 265 insertions(+), 169 deletions(-)
 create mode 100644 numba_dpex/core/typeconv/__init__.py
 create mode 100644 numba_dpex/core/typeconv/array_conversion.py
 create mode 100644 numba_dpex/core/utils/__init__.py
 create mode 100644 numba_dpex/core/utils/suai_helper.py

diff --git a/numba_dpex/compiler.py b/numba_dpex/compiler.py
index 4c01f00c11..3faa8981ae 100644
--- a/numba_dpex/compiler.py
+++ b/numba_dpex/compiler.py
@@ -19,6 +19,7 @@
 from numba_dpex.core.compiler import Compiler
 from numba_dpex.core.exceptions import KernelHasReturnValueError
 from numba_dpex.core.types import Array, USMNdArray
+from numba_dpex.core.utils import get_info_from_suai
 from numba_dpex.dpctl_support import dpctl_version
 from numba_dpex.utils import (
     IndeterminateExecutionQueueError,
@@ -26,7 +27,6 @@
     cfd_ctx_mgr_wrng_msg,
     copy_from_numpy_to_usm_obj,
     copy_to_numpy_from_usm_obj,
-    get_info_from_suai,
     has_usm_memory,
     mix_datatype_err_msg,
 )
diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py
index 003dcc5792..558fd141b0 100644
--- a/numba_dpex/core/exceptions.py
+++ b/numba_dpex/core/exceptions.py
@@ -270,12 +270,14 @@ class UnsupportedKernelArgumentError(Exception):
     the compiler.
 
     Args:
+        type (str): The type of the unsupported argument.
+        value (object): The Python object passed as a kernel argument.
         kernel_name (str): Name of kernel where the error was raised.
     """
 
-    def __init__(self, type, value, kernel_name) -> None:
+    def __init__(self, type, value, kernel_name="") -> None:
         self.message = (
-            f'Argument {value} passed to kernel "{kernel_name}" is of an '
+            f"Argument {value} passed to kernel {kernel_name} is of an "
             f"unsupported type ({type})."
         )
         super().__init__(self.message)
diff --git a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
index 40c3babb89..f734b02f97 100644
--- a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
+++ b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
@@ -16,6 +16,7 @@
     UnsupportedKernelArgumentError,
 )
 from numba_dpex.core.types import USMNdArray
+from numba_dpex.core.utils import SyclUSMArrayInterface, get_info_from_suai
 
 
 class _NumPyArrayPackerPayload:
@@ -40,53 +41,6 @@ def _check_for_invalid_access_type(self, array_val, access_type):
                 ",".join(Packer._access_types),
             )
 
-    def _get_info_from_suai(self, obj):
-        """
-        Extracts the metadata of an arrya-like object that provides a
-        __sycl_usm_array_interface__ (SUAI) attribute.
-
-        The ``dpctl.memory.as_usm_memory`` function converts the array-like
-        object into a dpctl.memory.USMMemory object. Using the ``as_usm_memory``
-        is an implicit way to verify if the array-like object is a legal
-        SYCL USM memory back Python object that can be passed to a dpex kernel.
-
-        Args:
-            obj: array-like object with a SUAI attribute.
-
-        Returns:
-            usm_mem: USM memory object.
-            total_size: Total number of items in the array.
-            shape: Shape of the array.
-            ndim: Total number of dimensions.
-            itemsize: Size of each item.
-            strides: Stride of the array.
-            dtype: Dtype of the array.
-        """
-        try:
-            usm_mem = dpctl_mem.as_usm_memory(obj)
-        except Exception:
-            logging.exception(
-                "array-like object does not implement the SUAI protocol."
-            )
-            raise SUAIProtocolError(self._pyfunc_name, obj)
-
-        shape = obj.__sycl_usm_array_interface__["shape"]
-        total_size = np.prod(obj.__sycl_usm_array_interface__["shape"])
-        ndim = len(obj.__sycl_usm_array_interface__["shape"])
-        itemsize = np.dtype(
-            obj.__sycl_usm_array_interface__["typestr"]
-        ).itemsize
-        dtype = np.dtype(obj.__sycl_usm_array_interface__["typestr"])
-        strides = obj.__sycl_usm_array_interface__["strides"]
-
-        if strides is None:
-            strides = [1] * ndim
-            for i in reversed(range(1, ndim)):
-                strides[i - 1] = strides[i] * shape[i]
-            strides = tuple(strides)
-
-        return usm_mem, total_size, shape, ndim, itemsize, strides, dtype
-
     def _unpack_array_helper(self, size, itemsize, buf, shape, strides, ndim):
         """
         Implements the unpacking logic for array arguments.
@@ -105,11 +59,11 @@ def _unpack_array_helper(self, size, itemsize, buf, shape, strides, ndim):
         """
         unpacked_array_attrs = []
 
-        # meminfo (FIXME: should be removed and the USMArrayType modified once
-        # NumPy support is removed)
+        # meminfo (FIXME: should be removed and the USMNdArray type modified
+        # once NumPy support is removed)
         unpacked_array_attrs.append(ctypes.c_size_t(0))
-        # meminfo (FIXME: Evaluate if the attribute should be removed and the
-        # USMArrayType modified once NumPy support is removed)
+        # parent (FIXME: Evaluate if the attribute should be removed and the
+        # USMNdArray type modified once NumPy support is removed)
         unpacked_array_attrs.append(ctypes.c_size_t(0))
         unpacked_array_attrs.append(ctypes.c_longlong(size))
         unpacked_array_attrs.append(ctypes.c_longlong(itemsize))
@@ -122,26 +76,30 @@ def _unpack_array_helper(self, size, itemsize, buf, shape, strides, ndim):
         return unpacked_array_attrs
 
     def _unpack_usm_array(self, val):
-        (
-            usm_mem,
-            total_size,
-            shape,
-            ndim,
-            itemsize,
-            strides,
-            dtype,
-        ) = self._get_info_from_suai(val)
+        """Flattens an object of USMNdArray type into ctypes objects to be
+        passed as kernel arguments.
+
+        Args:
+            val : An object of dpctl.types.UsmNdArray type.
+
+        Returns:
+            _type_: _description_
+        """
+        suai_attrs = get_info_from_suai(val)
 
         return self._unpack_array_helper(
-            total_size,
-            itemsize,
-            usm_mem,
-            shape,
-            strides,
-            ndim,
+            size=suai_attrs.size,
+            itemsize=suai_attrs.itemsize,
+            buf=suai_attrs.data,
+            shape=suai_attrs.shape,
+            strides=suai_attrs.strides,
+            ndim=suai_attrs.dimensions,
         )
 
     def _unpack_array(self, val, access_type):
+        """Deprecated to be removed once NumPy array support in kernels is
+        removed.
+        """
         packed_val = val
         # Check if the NumPy array is backed by USM memory
         usm_mem = utils.has_usm_memory(val)
diff --git a/numba_dpex/core/target.py b/numba_dpex/core/target.py
index 7afb88efc6..552b9299e5 100644
--- a/numba_dpex/core/target.py
+++ b/numba_dpex/core/target.py
@@ -9,7 +9,7 @@
 from llvmlite import ir as llvmir
 from llvmlite.llvmpy import core as lc
 from numba import typeof
-from numba.core import cgutils, datamodel, types, typing, utils
+from numba.core import cgutils, types, typing, utils
 from numba.core.base import BaseContext
 from numba.core.callconv import MinimalCallConv
 from numba.core.registry import cpu_target
@@ -17,12 +17,13 @@
 from numba.core.utils import cached_property
 
 from numba_dpex.core.datamodel.models import _init_data_model_manager
+from numba_dpex.core.exceptions import UnsupportedKernelArgumentError
+from numba_dpex.core.typeconv import to_usm_ndarray
+from numba_dpex.core.utils import get_info_from_suai
 from numba_dpex.utils import (
     address_space,
     calling_conv,
-    has_usm_memory,
     npytypes_array_to_dpex_array,
-    suai_to_dpex_array,
 )
 
 from .. import codegen
@@ -69,14 +70,20 @@ def resolve_argument_type(self, val):
         try:
             _type = type(typeof(val))
         except ValueError:
-            # For arbitrary array that is not recognized by Numba,
-            # we will end up in this path. We check if the array
-            # has __sycl_usm_array_interface__ attribute. If yes,
-            # we create the necessary Numba type to represent it
-            # and send it back.
-            if has_usm_memory(val) is not None:
-                return suai_to_dpex_array(val)
-
+            # When an array-like kernel argument is not recognized by
+            # numba-dpex, this additional check sees if the array-like object
+            # implements the __sycl_usm_array_interface__ protocol. For such
+            # cases, we treat the object as an UsmNdArray type.
+            try:
+                suai_attrs = get_info_from_suai(val)
+                return to_usm_ndarray(suai_attrs)
+            except Exception:
+                raise UnsupportedKernelArgumentError(
+                    type=str(type(val)), value=val
+                )
+
+        # FIXME: Remove once NumPy arrays are no longer supported as kernel
+        # args.
         if _type is types.npytypes.Array:
             # Convert npytypes.Array to numba_dpex.core.types.Array
             return npytypes_array_to_dpex_array(typeof(val))
diff --git a/numba_dpex/core/typeconv/__init__.py b/numba_dpex/core/typeconv/__init__.py
new file mode 100644
index 0000000000..61bb97a839
--- /dev/null
+++ b/numba_dpex/core/typeconv/__init__.py
@@ -0,0 +1,7 @@
+# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from .array_conversion import to_usm_ndarray
+
+__all__ = ["to_usm_ndarray"]
diff --git a/numba_dpex/core/typeconv/array_conversion.py b/numba_dpex/core/typeconv/array_conversion.py
new file mode 100644
index 0000000000..6e949c7349
--- /dev/null
+++ b/numba_dpex/core/typeconv/array_conversion.py
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from numba.np import numpy_support
+
+from numba_dpex.core.types import USMNdArray
+from numba_dpex.core.utils import get_info_from_suai
+from numba_dpex.utils.constants import address_space
+
+
+def to_usm_ndarray(suai_attrs, addrspace=address_space.GLOBAL):
+    """Converts an array-like object that has the _sycl_usm_array_interface__
+    attribute to numba_dpex.types.UsmNdArray.
+
+    Args:
+        suai_attrs: The extracted SUAI information for an array-like object.
+        addrspace: Address space this array is allocated in.
+
+    Returns: The Numba type for SUAI array.
+
+    Raises:
+        NotImplementedError: If the dtype of the passed array is not supported.
+    """
+    try:
+        dtype = numpy_support.from_dtype(suai_attrs.dtype)
+    except NotImplementedError:
+        raise ValueError("Unsupported array dtype: %s" % (dtype,))
+
+    # If converting from an unknown array-like object that implements
+    # __sycl_usm_array_interface__, the layout is always hard-coded to
+    # C-contiguous.
+    layout = "C"
+
+    return USMNdArray(
+        dtype=dtype,
+        ndim=suai_attrs.dimensions,
+        layout=layout,
+        usm_type=suai_attrs.usm_type,
+        device=suai_attrs.device,
+        readonly=not suai_attrs.is_writable,
+        name=None,
+        aligned=True,
+        addrspace=addrspace,
+    )
diff --git a/numba_dpex/core/utils/__init__.py b/numba_dpex/core/utils/__init__.py
new file mode 100644
index 0000000000..78bf969d57
--- /dev/null
+++ b/numba_dpex/core/utils/__init__.py
@@ -0,0 +1,10 @@
+# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from .suai_helper import SyclUSMArrayInterface, get_info_from_suai
+
+__all__ = [
+    "get_info_from_suai",
+    "SyclUSMArrayInterface",
+]
diff --git a/numba_dpex/core/utils/suai_helper.py b/numba_dpex/core/utils/suai_helper.py
new file mode 100644
index 0000000000..ea52bcc0dc
--- /dev/null
+++ b/numba_dpex/core/utils/suai_helper.py
@@ -0,0 +1,153 @@
+# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+
+import dpctl
+import dpctl.memory as dpctl_mem
+import numpy as np
+
+
+class SyclUSMArrayInterface:
+    """Stores as attributes the information extracted from a
+    __sycl_usm_array_interface__ dictionary as defined by dpctl.memory.Memory*
+    classes.
+    """
+
+    def __init__(
+        self,
+        data,
+        writable,
+        size,
+        shape,
+        dimensions,
+        itemsize,
+        strides,
+        dtype,
+        usm_type,
+        device,
+    ):
+        self._data = data
+        self._data_writeable = writable
+        self._size = size
+        self._shape = shape
+        self._dimensions = dimensions
+        self._itemsize = itemsize
+        self._strides = strides
+        self._dtype = dtype
+        self._usm_type = usm_type
+        self._device = device
+
+    @property
+    def data(self):
+        return self._data
+
+    @property
+    def is_writable(self):
+        return self._data_writeable
+
+    @property
+    def size(self):
+        return self._size
+
+    @property
+    def shape(self):
+        return self._shape
+
+    @property
+    def dimensions(self):
+        return self._dimensions
+
+    @property
+    def itemsize(self):
+        return self._itemsize
+
+    @property
+    def strides(self):
+        return self._strides
+
+    @property
+    def dtype(self):
+        return self._dtype
+
+    @property
+    def usm_type(self):
+        return self._usm_type
+
+    @property
+    def device(self):
+        return self._device
+
+
+def get_info_from_suai(obj):
+    """
+    Extracts the metadata of an object of type UsmNdArray using the objects
+    __sycl_usm_array_interface__ (SUAI) attribute.
+
+    The ``dpctl.memory.as_usm_memory`` function converts the array-like
+    object into a dpctl.memory.USMMemory object. Using the ``as_usm_memory``
+    is an implicit way to verify if the array-like object is a legal
+    SYCL USM memory back Python object that can be passed to a dpex kernel.
+
+    Args:
+        obj: array-like object with a SUAI attribute.
+
+    Returns:
+        A SyclUSMArrayInterface object
+
+    """
+
+    # dpctl.as_usm_memory validated if an array-like object, obj, has a well
+    # defined __sycl_usm_array_interface__ dictionary and converts it into a
+    # dpctl.memory.Memory* object.
+    try:
+        usm_mem = dpctl_mem.as_usm_memory(obj)
+    except Exception as e:
+        logging.exception(
+            "Array like object with __sycl_usm_array_interface__ could not be "
+            "converted to a dpctl.memory.Memory* object."
+        )
+        raise e
+
+    # The data attribute of __sycl_usm_array_interface__ is a 2-tuple.
+    # The first element is the data pointer and the second a boolean
+    # value indicating if the data is writable.
+    is_writable = usm_mem.__sycl_usm_array_interface__["data"][1]
+
+    shape = obj.__sycl_usm_array_interface__["shape"]
+    total_size = np.prod(shape)
+    ndim = len(shape)
+    dtype = np.dtype(obj.__sycl_usm_array_interface__["typestr"])
+    itemsize = dtype.itemsize
+
+    strides = obj.__sycl_usm_array_interface__["strides"]
+    if strides is None:
+        strides = [1] * ndim
+        for i in reversed(range(1, ndim)):
+            strides[i - 1] = strides[i] * shape[i]
+        strides = tuple(strides)
+
+    syclobj = usm_mem.__sycl_usm_array_interface__["syclobj"]
+    if not isinstance(syclobj, dpctl.SyclQueue):
+        raise ValueError(
+            "dpctl.SyclQueue could not be inferred. "
+            "The __sycl_usm_array_interface__ may be malformed."
+        )
+    device = syclobj.sycl_device.filter_string
+    usm_type = usm_mem.get_usm_type()
+
+    suai_info = SyclUSMArrayInterface(
+        data=usm_mem,
+        writable=is_writable,
+        size=total_size,
+        usm_type=usm_type,
+        device=device,
+        shape=shape,
+        dimensions=ndim,
+        itemsize=itemsize,
+        strides=strides,
+        dtype=dtype,
+    )
+
+    return suai_info
diff --git a/numba_dpex/tests/integration/test_sycl_usm_array_iface_interop.py b/numba_dpex/tests/integration/test_sycl_usm_array_iface_interop.py
index 2a127715c7..0c83486dfe 100644
--- a/numba_dpex/tests/integration/test_sycl_usm_array_iface_interop.py
+++ b/numba_dpex/tests/integration/test_sycl_usm_array_iface_interop.py
@@ -33,7 +33,7 @@ def test_kernel_valid_usm_obj(dtype):
     """Test if a ``numba_dpex.kernel`` function accepts a DuckUSMArray argument.
 
     The ``DuckUSMArray`` uses ``dpctl.memory`` to allocate a Python object that
-    defines a ``__sycl_usm_array__interface__`` attribute. We test if
+    defines a ``__sycl_usm_array_interface__`` attribute. We test if
     ``numba_dpex`` recognizes the ``DuckUSMArray`` as a valid USM-backed Python
     object and accepts it as a kernel argument.
 
diff --git a/numba_dpex/utils/__init__.py b/numba_dpex/utils/__init__.py
index 345bf9f208..cf8068e8c0 100644
--- a/numba_dpex/utils/__init__.py
+++ b/numba_dpex/utils/__init__.py
@@ -10,7 +10,6 @@
     as_usm_obj,
     copy_from_numpy_to_usm_obj,
     copy_to_numpy_from_usm_obj,
-    get_info_from_suai,
     has_usm_memory,
 )
 from numba_dpex.utils.constants import address_space, calling_conv
@@ -28,10 +27,7 @@
     mix_datatype_err_msg,
 )
 from numba_dpex.utils.misc import IndeterminateExecutionQueueError
-from numba_dpex.utils.type_conversion_fns import (
-    npytypes_array_to_dpex_array,
-    suai_to_dpex_array,
-)
+from numba_dpex.utils.type_conversion_fns import npytypes_array_to_dpex_array
 
 __all__ = [
     "LLVMTypes",
@@ -42,7 +38,6 @@
     "get_one",
     "npytypes_array_to_dpex_array",
     "npytypes_array_to_dpex_array",
-    "suai_to_dpex_array",
     "address_space",
     "calling_conv",
     "has_usm_memory",
@@ -53,5 +48,4 @@
     "cfd_ctx_mgr_wrng_msg",
     "IndeterminateExecutionQueueError_msg",
     "mix_datatype_err_msg",
-    "get_info_from_suai",
 ]
diff --git a/numba_dpex/utils/array_utils.py b/numba_dpex/utils/array_utils.py
index 054910c633..a6e1ce6bf3 100644
--- a/numba_dpex/utils/array_utils.py
+++ b/numba_dpex/utils/array_utils.py
@@ -20,40 +20,6 @@
 ]
 
 
-def get_info_from_suai(obj):
-    """
-    Convenience function to gather information from __sycl_usm_array_interface__.
-
-    Args:
-        obj: Array with SUAI attribute.
-
-    Returns:
-        usm_mem: USM memory object.
-        total_size: Total number of items in the array.
-        shape: Shape of the array.
-        ndim: Total number of dimensions.
-        itemsize: Size of each item.
-        strides: Stride of the array.
-        dtype: Dtype of the array.
-    """
-    usm_mem = dpctl_mem.as_usm_memory(obj)
-
-    assert usm_mem is not None
-
-    shape = obj.__sycl_usm_array_interface__["shape"]
-    total_size = np.prod(obj.__sycl_usm_array_interface__["shape"])
-    ndim = len(obj.__sycl_usm_array_interface__["shape"])
-    itemsize = np.dtype(obj.__sycl_usm_array_interface__["typestr"]).itemsize
-    dtype = np.dtype(obj.__sycl_usm_array_interface__["typestr"])
-    strides = obj.__sycl_usm_array_interface__["strides"]
-    if strides is None:
-        strides = [1] * ndim
-        for i in reversed(range(1, ndim)):
-            strides[i - 1] = strides[i] * shape[i]
-        strides = tuple(strides)
-    return usm_mem, total_size, shape, ndim, itemsize, strides, dtype
-
-
 def has_usm_memory(obj):
     """
     Determine and return a SYCL device accessible object.
diff --git a/numba_dpex/utils/type_conversion_fns.py b/numba_dpex/utils/type_conversion_fns.py
index 5cfb76aad8..c9f02bcbed 100644
--- a/numba_dpex/utils/type_conversion_fns.py
+++ b/numba_dpex/utils/type_conversion_fns.py
@@ -9,14 +9,12 @@
 
 """
 from numba.core import types
-from numba.np import numpy_support
 
 from numba_dpex.core.types import Array
 
-from .array_utils import get_info_from_suai
 from .constants import address_space
 
-__all__ = ["npytypes_array_to_dpex_array", "suai_to_dpex_array"]
+__all__ = ["npytypes_array_to_dpex_array"]
 
 
 def npytypes_array_to_dpex_array(arrtype, addrspace=address_space.GLOBAL):
@@ -63,47 +61,3 @@ def npytypes_array_to_dpex_array(arrtype, addrspace=address_space.GLOBAL):
         )
     else:
         raise NotImplementedError
-
-
-def suai_to_dpex_array(arr, addrspace=address_space.GLOBAL):
-    """Create type for Array with __sycl_usm_array_interface__ (SUAI) attribute.
-
-    This function creates a Numba type for arrays with SUAI attribute.
-
-    Args:
-        arr: Array with SUAI attribute.
-        addrspace: Address space this array is allocated in.
-
-    Returns: The Numba type for SUAI array.
-
-    Raises:
-        NotImplementedError: If the dtype of the passed array is not supported.
-    """
-    from numba_dpex.core.types import USMNdArray
-
-    (
-        usm_mem,
-        total_size,
-        shape,
-        ndim,
-        itemsize,
-        strides,
-        dtype,
-    ) = get_info_from_suai(arr)
-
-    try:
-        dtype = numpy_support.from_dtype(dtype)
-    except NotImplementedError:
-        raise ValueError("Unsupported array dtype: %s" % (dtype,))
-
-    layout = "C"
-    readonly = False
-
-    return USMNdArray(
-        dtype,
-        ndim,
-        layout,
-        None,
-        readonly,
-        addrspace=addrspace,
-    )

From 3639f619ae552d4022bfd4bf2e8ee94e3629885e Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Sat, 7 Jan 2023 13:13:38 -0600
Subject: [PATCH 33/51] Fix failing test_barrier tests and add checks to
 disallow specialization using strings.

---
 numba_dpex/decorators.py                      | 24 +++++++++-
 numba_dpex/tests/kernel_tests/test_barrier.py | 48 +++++++++----------
 .../test_kernel_specialization.py             | 15 ++++++
 3 files changed, 60 insertions(+), 27 deletions(-)

diff --git a/numba_dpex/decorators.py b/numba_dpex/decorators.py
index 22db0a88cc..ace5354e97 100644
--- a/numba_dpex/decorators.py
+++ b/numba_dpex/decorators.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from warnings import warn
+import inspect
 
 from numba.core import sigutils, types
 
@@ -48,7 +48,22 @@ def _kernel_dispatcher(pyfunc, sigs=None):
 
     if func_or_sig is None:
         return _kernel_dispatcher
+    elif isinstance(func_or_sig, str):
+        raise NotImplementedError(
+            "Specifying signatures as string is not yet supported by numba-dpex"
+        )
     elif isinstance(func_or_sig, list) or sigutils.is_signature(func_or_sig):
+        # String signatures are not supported as passing usm_ndarray type as
+        # a string is not possible. Numba's sigutils relies on the type being
+        # available in Numba's types.__dpct__ and dpex types are not registered
+        # there yet.
+        if isinstance(func_or_sig, list):
+            for sig in func_or_sig:
+                if isinstance(sig, str):
+                    raise NotImplementedError(
+                        "Specifying signatures as string is not yet supported "
+                        "by numba-dpex"
+                    )
         # Specialized signatures can either be a single signature or a list.
         # In case only one signature is provided convert it to a list
         if not isinstance(func_or_sig, list):
@@ -69,6 +84,13 @@ def _specialized_kernel_dispatcher(pyfunc):
         return _specialized_kernel_dispatcher
     else:
         func = func_or_sig
+        if not inspect.isfunction(func):
+            raise ValueError(
+                "Argument passed to the kernel decorator is neither a "
+                "function object, nor a signature. If you are trying to "
+                "specialize the kernel that takes a single argument, specify "
+                "the return type as void explicitly."
+            )
         return _kernel_dispatcher(func)
 
 
diff --git a/numba_dpex/tests/kernel_tests/test_barrier.py b/numba_dpex/tests/kernel_tests/test_barrier.py
index 8bc8ef5299..651adb0e2e 100644
--- a/numba_dpex/tests/kernel_tests/test_barrier.py
+++ b/numba_dpex/tests/kernel_tests/test_barrier.py
@@ -2,20 +2,22 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import platform
-
 import dpctl
+import dpctl.tensor as dpt
 import numpy as np
 import pytest
 
 import numba_dpex as dpex
+from numba_dpex import float32, usm_ndarray, void
 from numba_dpex.tests._helper import filter_strings
 
+f32arrty = usm_ndarray(float32, 1, "C", usm_type="device", device="0")
+
 
 @pytest.mark.parametrize("filter_str", filter_strings)
 def test_proper_lowering(filter_str):
     # This will trigger eager compilation
-    @dpex.kernel("void(float32[::1])")
+    @dpex.kernel(void(f32arrty))
     def twice(A):
         i = dpex.get_global_id(0)
         d = A[i]
@@ -23,19 +25,17 @@ def twice(A):
         A[i] = d * 2
 
     N = 256
-    arr = np.random.random(N).astype(np.float32)
-    orig = arr.copy()
-
-    with dpctl.device_context(filter_str):
-        twice[N, N // 2](arr)
-
+    arr = dpt.arange(N, dtype=dpt.float32, device="0")
+    orig = dpt.asnumpy(arr)
+    twice[N, N // 2](arr)
+    after = dpt.asnumpy(arr)
     # The computation is correct?
-    np.testing.assert_allclose(orig * 2, arr)
+    np.testing.assert_allclose(orig * 2, after)
 
 
 @pytest.mark.parametrize("filter_str", filter_strings)
 def test_no_arg_barrier_support(filter_str):
-    @dpex.kernel("void(float32[::1])")
+    @dpex.kernel(void(f32arrty))
     def twice(A):
         i = dpex.get_global_id(0)
         d = A[i]
@@ -44,21 +44,19 @@ def twice(A):
         A[i] = d * 2
 
     N = 256
-    arr = np.random.random(N).astype(np.float32)
-    orig = arr.copy()
-
-    with dpctl.device_context(filter_str):
-        twice[N, dpex.DEFAULT_LOCAL_SIZE](arr)
-
+    arr = dpt.arange(N, dtype=dpt.float32, device="0")
+    orig = dpt.asnumpy(arr)
+    twice[N, dpex.DEFAULT_LOCAL_SIZE](arr)
+    after = dpt.asnumpy(arr)
     # The computation is correct?
-    np.testing.assert_allclose(orig * 2, arr)
+    np.testing.assert_allclose(orig * 2, after)
 
 
 @pytest.mark.parametrize("filter_str", filter_strings)
 def test_local_memory(filter_str):
     blocksize = 10
 
-    @dpex.kernel("void(float32[::1])")
+    @dpex.kernel(void(f32arrty))
     def reverse_array(A):
         lm = dpex.local.array(shape=10, dtype=np.float32)
         i = dpex.get_global_id(0)
@@ -70,11 +68,9 @@ def reverse_array(A):
         # write
         A[i] += lm[blocksize - 1 - i]
 
-    arr = np.arange(blocksize).astype(np.float32)
-    orig = arr.copy()
-
-    with dpctl.device_context(filter_str):
-        reverse_array[blocksize, blocksize](arr)
-
+    arr = dpt.arange(blocksize, dtype=dpt.float32, device="0")
+    orig = dpt.asnumpy(arr)
+    reverse_array[blocksize, blocksize](arr)
+    after = dpt.asnumpy(arr)
     expected = orig[::-1] + orig
-    np.testing.assert_allclose(expected, arr)
+    np.testing.assert_allclose(expected, after)
diff --git a/numba_dpex/tests/kernel_tests/test_kernel_specialization.py b/numba_dpex/tests/kernel_tests/test_kernel_specialization.py
index 420612a896..dbe31275af 100644
--- a/numba_dpex/tests/kernel_tests/test_kernel_specialization.py
+++ b/numba_dpex/tests/kernel_tests/test_kernel_specialization.py
@@ -79,3 +79,18 @@ def test_execution_of_specialized_kernel():
 
     npc_expected = np.full(1024, 2, dtype=np.int64)
     assert np.array_equal(npc, npc_expected)
+
+
+def test_string_specialization():
+    """Test if NotImplementedError is raised when signature is a string"""
+
+    with pytest.raises(NotImplementedError):
+        dpex.kernel("(i64arrty, i64arrty, i64arrty)")
+
+    with pytest.raises(NotImplementedError):
+        dpex.kernel(
+            ["(i64arrty, i64arrty, i64arrty)", "(f32arrty, f32arrty, f32arrty)"]
+        )
+
+    with pytest.raises(ValueError):
+        dpex.kernel((i64arrty))

From b1ad49c1aaf3c3bc158895ece731665a34e84f5f Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Sat, 7 Jan 2023 14:26:23 -0600
Subject: [PATCH 34/51] Fix test_kernel_has_return_value_error test case.

---
 .../test_kernel_has_return_value_error.py      | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py b/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py
index 3140cffd0d..7451711eea 100644
--- a/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py
+++ b/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py
@@ -2,12 +2,14 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import dpctl
+import dpctl.tensor as dpt
 import numpy as np
 import pytest
 
 import numba_dpex as dpex
-from numba_dpex.tests._helper import filter_strings
+from numba_dpex import int32, usm_ndarray
+
+i32arrty = usm_ndarray(int32, 1, "C", usm_type="device", device="0")
 
 
 def f(a):
@@ -16,7 +18,7 @@ def f(a):
 
 list_of_sig = [
     None,
-    ("int32[::1](int32[::1])"),
+    (i32arrty(i32arrty)),
 ]
 
 
@@ -25,13 +27,9 @@ def sig(request):
     return request.param
 
 
-@pytest.mark.parametrize("filter_str", filter_strings)
-def test_return(filter_str, sig):
-    a = np.array(np.random.random(122), np.int32)
+def test_return(sig):
+    a = dpt.arange(1024, dtype=dpt.int32, device="0")
 
     with pytest.raises(dpex.core.exceptions.KernelHasReturnValueError):
         kernel = dpex.kernel(sig)(f)
-
-        device = dpctl.SyclDevice(filter_str)
-        with dpctl.device_context(device):
-            kernel[a.size, dpex.DEFAULT_LOCAL_SIZE](a)
+        kernel[a.size, dpex.DEFAULT_LOCAL_SIZE](a)

From bca00c5c3ce7c6447b1dd6451f0dc3ac3f43963e Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Sat, 7 Jan 2023 20:50:54 -0600
Subject: [PATCH 35/51] Address review comments.

   - Move compile_with_dpex into the core.compiler module.
   - The target and typing contexts are now not set from dpex_target
     inside the SpirvKernel.compile. They are passed in as args.
   - Fix tests impacted by these changes.
---
 numba_dpex/core/_compile_helper.py            | 99 -------------------
 numba_dpex/core/compiler.py                   | 96 +++++++++++++++++-
 .../core/kernel_interface/dispatcher.py       | 12 ++-
 numba_dpex/core/kernel_interface/func.py      | 11 ++-
 .../core/kernel_interface/kernel_base.py      |  2 +-
 .../core/kernel_interface/spirv_kernel.py     | 55 ++++++++---
 .../tests/kernel_tests/test_atomic_op.py      |  9 +-
 numba_dpex/tests/test_debuginfo.py            |  9 +-
 8 files changed, 166 insertions(+), 127 deletions(-)
 delete mode 100644 numba_dpex/core/_compile_helper.py

diff --git a/numba_dpex/core/_compile_helper.py b/numba_dpex/core/_compile_helper.py
deleted file mode 100644
index d0aa0b30ee..0000000000
--- a/numba_dpex/core/_compile_helper.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# SPDX-FileCopyrightText: 2022 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from types import FunctionType
-
-from numba.core import compiler, ir
-from numba.core import types as numba_types
-from numba.core.compiler_lock import global_compiler_lock
-
-from numba_dpex import config
-from numba_dpex.core import compiler as dpex_compiler
-from numba_dpex.core.descriptor import dpex_target
-from numba_dpex.core.exceptions import (
-    KernelHasReturnValueError,
-    UnreachableError,
-)
-
-
-@global_compiler_lock
-def compile_with_dpex(
-    pyfunc,
-    pyfunc_name,
-    args,
-    return_type,
-    debug=None,
-    is_kernel=True,
-    extra_compile_flags=None,
-):
-    """
-    Compiles the function using the dpex compiler pipeline and returns the
-    compiled result.
-
-    Args:
-        args: The list of arguments passed to the kernel.
-        debug (bool): Optional flag to turn on debug mode compilation.
-        extra_compile_flags: Extra flags passed to the compiler.
-
-    Returns:
-        cres: Compiled result.
-
-    Raises:
-        KernelHasReturnValueError: If the compiled function returns a
-        non-void value.
-    """
-    # First compilation will trigger the initialization of the backend.
-    typingctx = dpex_target.typing_context
-    targetctx = dpex_target.target_context
-
-    flags = compiler.Flags()
-    # Do not compile the function to a binary, just lower to LLVM
-    flags.debuginfo = config.DEBUGINFO_DEFAULT
-    flags.no_compile = True
-    flags.no_cpython_wrapper = True
-    flags.nrt = False
-
-    if debug is not None:
-        flags.debuginfo = debug
-
-    # Run compilation pipeline
-    if isinstance(pyfunc, FunctionType):
-        cres = compiler.compile_extra(
-            typingctx=typingctx,
-            targetctx=targetctx,
-            func=pyfunc,
-            args=args,
-            return_type=return_type,
-            flags=flags,
-            locals={},
-            pipeline_class=dpex_compiler.Compiler,
-        )
-    elif isinstance(pyfunc, ir.FunctionIR):
-        cres = compiler.compile_ir(
-            typingctx=typingctx,
-            targetctx=targetctx,
-            func_ir=pyfunc,
-            args=args,
-            return_type=return_type,
-            flags=flags,
-            locals={},
-            pipeline_class=dpex_compiler.Compiler,
-        )
-    else:
-        raise UnreachableError()
-
-    if (
-        is_kernel
-        and cres.signature.return_type is not None
-        and cres.signature.return_type != numba_types.void
-    ):
-        raise KernelHasReturnValueError(
-            kernel_name=pyfunc_name,
-            return_type=cres.signature.return_type,
-        )
-    # Linking depending libraries
-    library = cres.library
-    library.finalize()
-
-    return cres
diff --git a/numba_dpex/core/compiler.py b/numba_dpex/core/compiler.py
index df5d87aef4..5dd0445ee2 100644
--- a/numba_dpex/core/compiler.py
+++ b/numba_dpex/core/compiler.py
@@ -2,7 +2,12 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from types import FunctionType
+
+from numba.core import compiler, ir
+from numba.core import types as numba_types
 from numba.core.compiler import CompilerBase
+from numba.core.compiler_lock import global_compiler_lock
 from numba.core.compiler_machinery import PassManager
 from numba.core.typed_passes import (
     AnnotateTypes,
@@ -30,7 +35,12 @@
     WithLifting,
 )
 
-from numba_dpex.core.exceptions import UnsupportedCompilationModeError
+from numba_dpex import config
+from numba_dpex.core.exceptions import (
+    KernelHasReturnValueError,
+    UnreachableError,
+    UnsupportedCompilationModeError,
+)
 from numba_dpex.core.passes.passes import (
     ConstantSizeStaticLocalMemoryPass,
     DpexLowering,
@@ -205,3 +215,87 @@ def define_pipelines(self):
         if self.state.status.can_fallback or self.state.flags.force_pyobject:
             raise UnsupportedCompilationModeError()
         return pms
+
+
+@global_compiler_lock
+def compile_with_dpex(
+    pyfunc,
+    pyfunc_name,
+    args,
+    return_type,
+    target_context,
+    typing_context,
+    debug=None,
+    is_kernel=True,
+    extra_compile_flags=None,
+):
+    """
+    Compiles a function using the dpex compiler pipeline and returns the
+    compiled result.
+
+    Args:
+        args: The list of arguments passed to the kernel.
+        debug (bool): Optional flag to turn on debug mode compilation.
+        extra_compile_flags: Extra flags passed to the compiler.
+
+    Returns:
+        cres: Compiled result.
+
+    Raises:
+        KernelHasReturnValueError: If the compiled function returns a
+        non-void value.
+    """
+    # First compilation will trigger the initialization of the backend.
+    typingctx = typing_context
+    targetctx = target_context
+
+    flags = compiler.Flags()
+    # Do not compile the function to a binary, just lower to LLVM
+    flags.debuginfo = config.DEBUGINFO_DEFAULT
+    flags.no_compile = True
+    flags.no_cpython_wrapper = True
+    flags.nrt = False
+
+    if debug is not None:
+        flags.debuginfo = debug
+
+    # Run compilation pipeline
+    if isinstance(pyfunc, FunctionType):
+        cres = compiler.compile_extra(
+            typingctx=typingctx,
+            targetctx=targetctx,
+            func=pyfunc,
+            args=args,
+            return_type=return_type,
+            flags=flags,
+            locals={},
+            pipeline_class=Compiler,
+        )
+    elif isinstance(pyfunc, ir.FunctionIR):
+        cres = compiler.compile_ir(
+            typingctx=typingctx,
+            targetctx=targetctx,
+            func_ir=pyfunc,
+            args=args,
+            return_type=return_type,
+            flags=flags,
+            locals={},
+            pipeline_class=Compiler,
+        )
+    else:
+        raise UnreachableError()
+
+    if (
+        is_kernel
+        and cres.signature.return_type is not None
+        and cres.signature.return_type != numba_types.void
+    ):
+        raise KernelHasReturnValueError(
+            kernel_name=pyfunc_name,
+            return_type=cres.signature.return_type,
+        )
+    # Linking depending libraries
+    library = cres.library
+    library.finalize()
+
+    return cres
diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index 4c03f3b91c..bb91dcd399 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -9,7 +9,7 @@
 
 import dpctl
 import dpctl.program as dpctl_prog
-from numba.core import sigutils, types, utils
+from numba.core import sigutils
 from numba.core.types import Array as NpArrayType
 from numba.core.types import void
 
@@ -138,11 +138,17 @@ def cache_hits(self):
         return self._cache_hits
 
     def _compile_and_cache(self, argtypes, backend, device_type, cache):
+        # We always compile the kernel using the dpex_target.
+        typingctx = dpex_target.typing_context
+        targetctx = dpex_target.target_context
+
         kernel = SpirvKernel(self.pyfunc, self.kernel_name)
         kernel.compile(
-            arg_types=argtypes,
+            args=argtypes,
+            typing_ctx=typingctx,
+            target_ctx=targetctx,
             debug=self.debug_flags,
-            extra_compile_flags=self.compile_flags,
+            compile_flags=self.compile_flags,
         )
 
         device_driver_ir_module = kernel.device_driver_ir_module
diff --git a/numba_dpex/core/kernel_interface/func.py b/numba_dpex/core/kernel_interface/func.py
index c0c24789fe..42fe2e19c1 100644
--- a/numba_dpex/core/kernel_interface/func.py
+++ b/numba_dpex/core/kernel_interface/func.py
@@ -8,7 +8,8 @@
 
 from numba.core.typing.templates import AbstractTemplate, ConcreteTemplate
 
-from numba_dpex.core._compile_helper import compile_with_dpex
+from numba_dpex.core.compiler import compile_with_dpex
+from numba_dpex.core.descriptor import dpex_target
 
 
 def compile_func(pyfunc, return_type, args, debug=None):
@@ -16,6 +17,8 @@ def compile_func(pyfunc, return_type, args, debug=None):
         pyfunc=pyfunc,
         pyfunc_name=pyfunc.__name__,
         return_type=return_type,
+        target_context=dpex_target.target_context,
+        typing_context=dpex_target.typing_context,
         args=args,
         is_kernel=False,
         debug=debug,
@@ -36,7 +39,6 @@ class _function_template(ConcreteTemplate):
 
 def compile_func_template(pyfunc, debug=None):
     """Compile a DpexFunctionTemplate"""
-    from numba_dpex.core.descriptor import dpex_target
 
     dft = DpexFunctionTemplate(pyfunc, debug=debug)
 
@@ -48,8 +50,7 @@ def generic(self, args, kws):
                 raise AssertionError("No keyword arguments allowed.")
             return dft.compile(args)
 
-    typingctx = dpex_target.typing_context
-    typingctx.insert_user_function(dft, _function_template)
+    dpex_target.typing_context.insert_user_function(dft, _function_template)
     return dft
 
 
@@ -73,6 +74,8 @@ def compile(self, args):
                 pyfunc=self.py_func,
                 pyfunc_name=self.py_func.__name__,
                 return_type=None,
+                target_context=dpex_target.target_context,
+                typing_context=dpex_target.typing_context,
                 args=args,
                 is_kernel=False,
                 debug=self.debug,
diff --git a/numba_dpex/core/kernel_interface/kernel_base.py b/numba_dpex/core/kernel_interface/kernel_base.py
index 12c32c2446..b0dd31f5b3 100644
--- a/numba_dpex/core/kernel_interface/kernel_base.py
+++ b/numba_dpex/core/kernel_interface/kernel_base.py
@@ -55,6 +55,6 @@ def module_name(self):
         raise NotImplementedError
 
     @abc.abstractmethod
-    def compile(self, target_ctx, args, debug, compile_flags):
+    def compile(self, target_ctx, typing_ctx, args, debug, compile_flags):
         """Abstract method to compile a Kernel instance."""
         raise NotImplementedError
diff --git a/numba_dpex/core/kernel_interface/spirv_kernel.py b/numba_dpex/core/kernel_interface/spirv_kernel.py
index cb11f345b5..932f4edd9a 100644
--- a/numba_dpex/core/kernel_interface/spirv_kernel.py
+++ b/numba_dpex/core/kernel_interface/spirv_kernel.py
@@ -8,7 +8,7 @@
 from numba.core import ir
 
 from numba_dpex import spirv_generator
-from numba_dpex.core import _compile_helper
+from numba_dpex.core.compiler import compile_with_dpex
 from numba_dpex.core.exceptions import UncompiledKernelError, UnreachableError
 
 from .kernel_base import KernelInterface
@@ -71,34 +71,67 @@ def module_name(self):
 
     @property
     def target_context(self):
+        """Returns the target context that was used to compile the kernel.
+
+        Raises:
+            UncompiledKernelError: If the kernel was not yet compiled.
+
+        Returns:
+            target context used to compile the kernel
+        """
         if self._target_context:
             return self._target_context
         else:
             raise UncompiledKernelError(self._pyfunc_name)
 
-    def compile(self, arg_types, debug, extra_compile_flags):
-        """_summary_
+    @property
+    def typing_context(self):
+        """Returns the typing context that was used to compile the kernel.
+
+        Raises:
+            UncompiledKernelError: If the kernel was not yet compiled.
+
+        Returns:
+            typing context used to compile the kernel
+        """
+        if self._typing_context:
+            return self._typing_context
+        else:
+            raise UncompiledKernelError(self._pyfunc_name)
+
+    def compile(
+        self,
+        target_ctx,
+        typing_ctx,
+        args,
+        debug,
+        compile_flags,
+    ):
+        """Compiles a kernel using numba_dpex.core.compiler.Compiler.
 
         Args:
-            arg_types (_type_): _description_
+            args (_type_): _description_
             debug (_type_): _description_
-            extra_compile_flags (_type_): _description_
+            compile_flags (_type_): _description_
         """
 
-        logging.debug("compiling SpirvKernel with arg types", arg_types)
+        logging.debug("compiling SpirvKernel with arg types", args)
+
+        self._target_context = target_ctx
+        self._typing_context = typing_ctx
 
-        cres = _compile_helper.compile_with_dpex(
+        cres = compile_with_dpex(
             self._func,
             self._pyfunc_name,
-            args=arg_types,
+            args=args,
             return_type=None,
             debug=debug,
             is_kernel=True,
-            extra_compile_flags=extra_compile_flags,
+            typing_context=typing_ctx,
+            target_context=target_ctx,
+            extra_compile_flags=compile_flags,
         )
 
-        self._target_context = cres.target_context
-
         func = cres.library.get_function(cres.fndesc.llvm_func_name)
         kernel = cres.target_context.prepare_ocl_kernel(
             func, cres.signature.args
diff --git a/numba_dpex/tests/kernel_tests/test_atomic_op.py b/numba_dpex/tests/kernel_tests/test_atomic_op.py
index fd036fed10..95e25d08f9 100644
--- a/numba_dpex/tests/kernel_tests/test_atomic_op.py
+++ b/numba_dpex/tests/kernel_tests/test_atomic_op.py
@@ -2,14 +2,13 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import os
-
 import dpctl
 import numpy as np
 import pytest
 
 import numba_dpex as dpex
 from numba_dpex import config
+from numba_dpex.core.descriptor import dpex_target
 from numba_dpex.tests._helper import filter_strings, override_config
 
 global_size = 100
@@ -217,9 +216,11 @@ def test_atomic_fp_native(
 
     with override_config("NATIVE_FP_ATOMICS", NATIVE_FP_ATOMICS):
         kernel.compile(
-            arg_types=argtypes,
+            args=argtypes,
             debug=None,
-            extra_compile_flags=None,
+            compile_flags=None,
+            target_ctx=dpex_target.target_context,
+            typing_ctx=dpex_target.typing_context,
         )
 
         is_native_atomic = expected_spirv_function in kernel._llvm_module
diff --git a/numba_dpex/tests/test_debuginfo.py b/numba_dpex/tests/test_debuginfo.py
index 7fc5cc1e64..4b2eadf15e 100644
--- a/numba_dpex/tests/test_debuginfo.py
+++ b/numba_dpex/tests/test_debuginfo.py
@@ -6,12 +6,11 @@
 
 import re
 
-import dpctl
 import pytest
 from numba.core import types
 
 import numba_dpex as dpex
-from numba_dpex import compiler
+from numba_dpex.core.descriptor import dpex_target
 from numba_dpex.tests._helper import override_config
 from numba_dpex.utils import npytypes_array_to_dpex_array
 
@@ -28,9 +27,11 @@ def get_kernel_ir(fn, sig, debug=None):
         fn, fn.__name__
     )
     kernel.compile(
-        arg_types=sig,
+        args=sig,
+        target_ctx=dpex_target.target_context,
+        typing_ctx=dpex_target.typing_context,
         debug=debug,
-        extra_compile_flags=None,
+        compile_flags=None,
     )
     return kernel.llvm_module
 

From 876d4d295fe87676cb0dc54b49c19ea535184557 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Mon, 9 Jan 2023 23:12:08 -0600
Subject: [PATCH 36/51] Use new compiler to compiler parfors.

---
 numba_dpex/core/passes/lowerer.py | 72 +++++++++++++++++++++++++------
 1 file changed, 59 insertions(+), 13 deletions(-)

diff --git a/numba_dpex/core/passes/lowerer.py b/numba_dpex/core/passes/lowerer.py
index 8dfde3e6c9..b12cf31f5b 100644
--- a/numba_dpex/core/passes/lowerer.py
+++ b/numba_dpex/core/passes/lowerer.py
@@ -11,6 +11,7 @@
 from collections import OrderedDict
 
 import dpctl
+import dpctl.program as dpctl_prog
 import numba
 import numpy as np
 from numba.core import compiler, funcdesc, ir, lowering, sigutils, types
@@ -44,6 +45,7 @@
 
 import numba_dpex as dpex
 from numba_dpex import config
+from numba_dpex.core.descriptor import dpex_target
 from numba_dpex.core.target import DpexTargetContext
 from numba_dpex.core.types import Array
 from numba_dpex.dpctl_iface import KernelLaunchOps
@@ -52,6 +54,57 @@
 from .dufunc_inliner import dufunc_inliner
 
 
+def _compile_kernel_parfor(
+    sycl_queue, kernel_name, func_ir, args, args_with_addrspaces, debug=None
+):
+    # We only accept numba_dpex.core.types.Array type
+    for arg in args_with_addrspaces:
+        if isinstance(arg, types.npytypes.Array) and not isinstance(arg, Array):
+            raise TypeError(
+                "Only numba_dpex.core.types.Array objects are supported as "
+                + "kernel arguments. Received %s" % (type(arg))
+            )
+    if config.DEBUG:
+        print("compile_kernel_parfor", args)
+        for a in args_with_addrspaces:
+            print(a, type(a))
+            if isinstance(a, types.npytypes.Array):
+                print("addrspace:", a.addrspace)
+
+    # Create a SPIRVKernel object
+    kernel = dpex.core.kernel_interface.spirv_kernel.SpirvKernel(
+        func_ir, kernel_name
+    )
+
+    # compile the kernel
+    kernel.compile(
+        args=args_with_addrspaces,
+        typing_ctx=dpex_target.typing_context,
+        target_ctx=dpex_target.target_context,
+        debug=debug,
+        compile_flags=None,
+    )
+
+    # Compile a SYCL Kernel object rom the SPIRVKernel
+
+    dpctl_create_program_from_spirv_flags = []
+
+    if debug or config.OPT == 0:
+        # if debug is ON we need to pass additional flags to igc.
+        dpctl_create_program_from_spirv_flags = ["-g", "-cl-opt-disable"]
+
+    # create a program
+    kernel_bundle = dpctl_prog.create_program_from_spirv(
+        sycl_queue,
+        kernel.device_driver_ir_module,
+        " ".join(dpctl_create_program_from_spirv_flags),
+    )
+    #  create a kernel
+    sycl_kernel = kernel_bundle.get_sycl_kernel(kernel.module_name)
+
+    return sycl_kernel
+
+
 def _print_block(block):
     for i, inst in enumerate(block.body):
         print("    ", i, inst)
@@ -268,13 +321,9 @@ def _create_gufunc_for_parfor_body(
     lowerer,
     parfor,
     typemap,
-    typingctx,
-    targetctx,
     flags,
     loop_ranges,
-    locals,
     has_aliases,
-    index_var_typ,
     races,
 ):
     """
@@ -656,8 +705,9 @@ def print_arg_with_addrspaces(args):
         print("after DUFunc inline".center(80, "-"))
         gufunc_ir.dump()
 
-    kernel_func = dpex.compiler.compile_kernel_parfor(
+    sycl_kernel = _compile_kernel_parfor(
         dpctl.get_current_queue(),
+        gufunc_name,
         gufunc_ir,
         gufunc_param_types,
         param_types_addrspaces,
@@ -669,7 +719,7 @@ def print_arg_with_addrspaces(args):
     if config.DEBUG_ARRAY_OPT:
         print("kernel_sig = ", kernel_sig)
 
-    return kernel_func, parfor_args, kernel_sig, func_arg_types, setitems
+    return sycl_kernel, parfor_args, kernel_sig, func_arg_types, setitems
 
 
 def _lower_parfor_gufunc(lowerer, parfor):
@@ -762,13 +812,9 @@ def _lower_parfor_gufunc(lowerer, parfor):
             lowerer,
             parfor,
             typemap,
-            typingctx,
-            targetctx,
             flags,
             loop_ranges,
-            {},
             bool(alias_map),
-            index_var_typ,
             parfor.races,
         )
     finally:
@@ -893,7 +939,7 @@ def bump_alpha(c, class_map):
 
 def generate_kernel_launch_ops(
     lowerer,
-    cres,
+    kernel,
     gu_signature,
     outer_sig,
     expr_args,
@@ -930,7 +976,7 @@ def generate_kernel_launch_ops(
         print("modified_arrays", modified_arrays)
 
     # get dpex_cpu_portion_lowerer object
-    kernel_launcher = KernelLaunchOps(lowerer, cres.kernel, num_inputs)
+    kernel_launcher = KernelLaunchOps(lowerer, kernel, num_inputs)
 
     # Get a pointer to the current queue
     curr_queue = kernel_launcher.get_current_queue()
@@ -977,7 +1023,7 @@ def val_type_or_none(context, lowerer, x):
     ]
     all_args = [loadvar_or_none(lowerer, x) for x in expr_args[:ninouts]]
 
-    keep_alive_kernels.append(cres)
+    keep_alive_kernels.append(kernel)
 
     # Call clSetKernelArg for each arg and create arg array for
     # the enqueue function. Put each part of each argument into

From da1440f8878103b4c5f00737c18c9454b3d959ac Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Sat, 7 Jan 2023 12:03:22 -0600
Subject: [PATCH 37/51] Fully remove numba_dpex.compiler module.

---
 numba_dpex/compiler.py                      | 708 --------------------
 numba_dpex/tests/test_no_copy_usm_shared.py |   2 +-
 2 files changed, 1 insertion(+), 709 deletions(-)
 delete mode 100644 numba_dpex/compiler.py

diff --git a/numba_dpex/compiler.py b/numba_dpex/compiler.py
deleted file mode 100644
index 3faa8981ae..0000000000
--- a/numba_dpex/compiler.py
+++ /dev/null
@@ -1,708 +0,0 @@
-# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import copy
-import ctypes
-import warnings
-from inspect import signature
-from types import FunctionType
-
-import dpctl
-import dpctl.program as dpctl_prog
-import dpctl.utils
-import numpy as np
-from numba.core import compiler, ir, types
-from numba.core.compiler_lock import global_compiler_lock
-
-from numba_dpex import config
-from numba_dpex.core.compiler import Compiler
-from numba_dpex.core.exceptions import KernelHasReturnValueError
-from numba_dpex.core.types import Array, USMNdArray
-from numba_dpex.core.utils import get_info_from_suai
-from numba_dpex.dpctl_support import dpctl_version
-from numba_dpex.utils import (
-    IndeterminateExecutionQueueError,
-    as_usm_obj,
-    cfd_ctx_mgr_wrng_msg,
-    copy_from_numpy_to_usm_obj,
-    copy_to_numpy_from_usm_obj,
-    has_usm_memory,
-    mix_datatype_err_msg,
-)
-
-from . import spirv_generator
-
-_RO_KERNEL_ARG = "read_only"
-_WO_KERNEL_ARG = "write_only"
-_RW_KERNEL_ARG = "read_write"
-
-
-def _raise_datatype_mixed_error(argtypes):
-    error_message = mix_datatype_err_msg + ("%s" % str(argtypes))
-    raise TypeError(error_message)
-
-
-def _raise_no_device_found_error():
-    error_message = (
-        "No SYCL device specified. "
-        "Usage : jit_fn[device, globalsize, localsize](...)"
-    )
-    raise ValueError(error_message)
-
-
-def _raise_invalid_kernel_enqueue_args():
-    error_message = (
-        "Incorrect number of arguments for enqueuing numba_dpex.kernel. "
-        "Usage: device_env, global size, local size. "
-        "The local size argument is optional."
-    )
-    raise ValueError(error_message)
-
-
-@global_compiler_lock
-def compile_with_depx(pyfunc, return_type, args, is_kernel, debug=None):
-    """
-    Compiles the function using the dpex compiler pipeline and returns the
-    compiled result.
-
-    Args:
-        pyfunc: The Python function to be compiled.
-        return_type: The Numba type of the return value.
-        args: The list of arguments sent to the Python function.
-        is_kernel (bool): Indicates whether the function is decorated
-        with @numba_depx.kernel or not.
-        debug (bool): Flag to turn debug mode ON/OFF.
-
-    Returns:
-        cres: Compiled result.
-
-    Raises:
-        TypeError: @numba_depx.kernel does not allow users to return any
-            value. TypeError is raised when users do.
-
-    """
-    # First compilation will trigger the initialization of the backend.
-    from .core.descriptor import dpex_target
-
-    typingctx = dpex_target.typing_context
-    targetctx = dpex_target.target_context
-
-    flags = compiler.Flags()
-    # Do not compile (generate native code), just lower (to LLVM)
-    flags.debuginfo = config.DEBUGINFO_DEFAULT
-    flags.no_compile = True
-    flags.no_cpython_wrapper = True
-    flags.nrt = False
-
-    if debug is not None:
-        flags.debuginfo = debug
-
-    # Run compilation pipeline
-    if isinstance(pyfunc, FunctionType):
-        cres = compiler.compile_extra(
-            typingctx=typingctx,
-            targetctx=targetctx,
-            func=pyfunc,
-            args=args,
-            return_type=return_type,
-            flags=flags,
-            locals={},
-            pipeline_class=Compiler,
-        )
-    elif isinstance(pyfunc, ir.FunctionIR):
-        cres = compiler.compile_ir(
-            typingctx=typingctx,
-            targetctx=targetctx,
-            func_ir=pyfunc,
-            args=args,
-            return_type=return_type,
-            flags=flags,
-            locals={},
-            pipeline_class=Compiler,
-        )
-    else:
-        assert 0
-
-    if (
-        is_kernel
-        and cres.signature.return_type is not None
-        and cres.signature.return_type != types.void
-    ):
-        raise KernelHasReturnValueError(
-            kernel_name=pyfunc.__name__, return_type=cres.signature.return_type
-        )
-
-    # Linking depending libraries
-    library = cres.library
-    library.finalize()
-
-    return cres
-
-
-def compile_kernel(sycl_queue, pyfunc, args, access_types, debug=None):
-    # For any array we only accept numba_dpex.types.Array
-    for arg in args:
-        if isinstance(arg, types.npytypes.Array) and not (
-            isinstance(arg, Array) or isinstance(arg, USMNdArray)
-        ):
-            raise TypeError(
-                "Only numba_dpex.core.types.USMNdArray "
-                + "objects are supported as kernel arguments. "
-                + "Received %s" % (type(arg))
-            )
-
-    if config.DEBUG:
-        print("compile_kernel", args)
-        debug = True
-    if not sycl_queue:
-        # We expect the sycl_queue to be provided when this function is called
-        raise ValueError("SYCL queue is required for compiling a kernel")
-
-    cres = compile_with_depx(
-        pyfunc=pyfunc, return_type=None, args=args, is_kernel=True, debug=debug
-    )
-    func = cres.library.get_function(cres.fndesc.llvm_func_name)
-    kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args)
-
-    # A reference to the target context is stored in the Kernel to
-    # reference the context later in code generation. For example, we link
-    # the kernel object with a spir_func defining atomic operations only
-    # when atomic operations are used in the kernel.
-    oclkern = Kernel(
-        context=cres.target_context,
-        sycl_queue=sycl_queue,
-        llvm_module=kernel.module,
-        name=kernel.name,
-        argtypes=cres.signature.args,
-        ordered_arg_access_types=access_types,
-    )
-    return oclkern
-
-
-def compile_kernel_parfor(
-    sycl_queue, func_ir, args, args_with_addrspaces, debug=None
-):
-    # We only accept numba_dpex.core.types.Array type
-    for arg in args_with_addrspaces:
-        if isinstance(arg, types.npytypes.Array) and not isinstance(arg, Array):
-            raise TypeError(
-                "Only numba_dpex.core.types.Array objects are supported as "
-                + "kernel arguments. Received %s" % (type(arg))
-            )
-    if config.DEBUG:
-        print("compile_kernel_parfor", args)
-        for a in args_with_addrspaces:
-            print(a, type(a))
-            if isinstance(a, types.npytypes.Array):
-                print("addrspace:", a.addrspace)
-
-    cres = compile_with_depx(
-        pyfunc=func_ir,
-        return_type=None,
-        args=args_with_addrspaces,
-        is_kernel=True,
-        debug=debug,
-    )
-    func = cres.library.get_function(cres.fndesc.llvm_func_name)
-
-    if config.DEBUG:
-        print("compile_kernel_parfor signature", cres.signature.args)
-        for a in cres.signature.args:
-            print(a, type(a))
-
-    kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args)
-    oclkern = Kernel(
-        context=cres.target_context,
-        sycl_queue=sycl_queue,
-        llvm_module=kernel.module,
-        name=kernel.name,
-        argtypes=args_with_addrspaces,
-    )
-
-    return oclkern
-
-
-def _ensure_valid_work_item_grid(val, sycl_queue):
-
-    if not isinstance(val, (tuple, list, int)):
-        error_message = (
-            "Cannot create work item dimension from provided argument"
-        )
-        raise ValueError(error_message)
-
-    if isinstance(val, int):
-        val = [val]
-
-    # TODO: we need some way to check the max dimensions
-    """
-    if len(val) > device_env.get_max_work_item_dims():
-        error_message = ("Unsupported number of work item dimensions ")
-        raise ValueError(error_message)
-    """
-
-    return list(
-        val[::-1]
-    )  # reversing due to sycl and opencl interop kernel range mismatch semantic
-
-
-def _ensure_valid_work_group_size(val, work_item_grid):
-
-    if not isinstance(val, (tuple, list, int)):
-        error_message = (
-            "Cannot create work item dimension from provided argument"
-        )
-        raise ValueError(error_message)
-
-    if isinstance(val, int):
-        val = [val]
-
-    if len(val) != len(work_item_grid):
-        error_message = (
-            "Unsupported number of work item dimensions, "
-            + "dimensions of global and local work items has to be the same "
-        )
-        raise ValueError(error_message)
-
-    return list(
-        val[::-1]
-    )  # reversing due to sycl and opencl interop kernel range mismatch semantic
-
-
-class KernelBase(object):
-    """Define interface for configurable kernels"""
-
-    def __init__(self):
-        self.global_size = []
-        self.local_size = []
-        self.sycl_queue = None
-
-        # list of supported access types, stored in dict for fast lookup
-        self.valid_access_types = {
-            _RO_KERNEL_ARG: _RO_KERNEL_ARG,
-            _WO_KERNEL_ARG: _WO_KERNEL_ARG,
-            _RW_KERNEL_ARG: _RW_KERNEL_ARG,
-        }
-
-    def copy(self):
-        return copy.copy(self)
-
-    def configure(self, sycl_queue, global_size, local_size=None):
-        """Configure the OpenCL kernel. The local_size can be None"""
-        clone = self.copy()
-        clone.global_size = global_size
-        clone.local_size = local_size
-        clone.sycl_queue = sycl_queue
-
-        return clone
-
-    def __getitem__(self, args):
-        """Mimick CUDA python's square-bracket notation for configuration.
-        This assumes the argument to be:
-            `global size, local size`
-        """
-        ls = None
-        nargs = len(args)
-        # Check if the kernel enquing arguments are sane
-        if nargs < 1 or nargs > 2:
-            _raise_invalid_kernel_enqueue_args
-
-        sycl_queue = dpctl.get_current_queue()
-
-        gs = _ensure_valid_work_item_grid(args[0], sycl_queue)
-        # If the optional local size argument is provided
-        if nargs == 2 and args[1] != []:
-            ls = _ensure_valid_work_group_size(args[1], gs)
-
-        return self.configure(sycl_queue, gs, ls)
-
-
-class Kernel(KernelBase):
-    """
-    A OCL kernel object
-    """
-
-    def __init__(
-        self,
-        context,
-        sycl_queue,
-        llvm_module,
-        name,
-        argtypes,
-        ordered_arg_access_types=None,
-    ):
-        super(Kernel, self).__init__()
-        self._llvm_module = llvm_module
-        self.assembly = self.binary = llvm_module.__str__()
-        self.entry_name = name
-        self.argument_types = tuple(argtypes)
-        self.ordered_arg_access_types = ordered_arg_access_types
-        self._argloc = []
-        self.sycl_queue = sycl_queue
-        self.context = context
-
-        dpctl_create_program_from_spirv_flags = []
-        # First-time compilation using SPIRV-Tools
-        if config.DEBUG:
-            with open("llvm_kernel.ll", "w") as f:
-                f.write(self.binary)
-
-        if config.DEBUG or config.OPT == 0:
-            # if debug is ON we need to pass additional
-            # flags to igc.
-            dpctl_create_program_from_spirv_flags = ["-g", "-cl-opt-disable"]
-
-        self.spirv_bc = spirv_generator.llvm_to_spirv(
-            self.context, self.assembly, self._llvm_module.as_bitcode()
-        )
-
-        # create a program
-        self.program = dpctl_prog.create_program_from_spirv(
-            self.sycl_queue,
-            self.spirv_bc,
-            " ".join(dpctl_create_program_from_spirv_flags),
-        )
-        #  create a kernel
-        self.kernel = self.program.get_sycl_kernel(self.entry_name)
-
-    def __call__(self, *args):
-        """
-        Create a list of the kernel arguments by unpacking pyobject values
-        into ctypes values.
-        """
-
-        kernelargs = []
-        internal_device_arrs = []
-        for ty, val, access_type in zip(
-            self.argument_types, args, self.ordered_arg_access_types
-        ):
-            self._unpack_argument(
-                ty,
-                val,
-                self.sycl_queue,
-                kernelargs,
-                internal_device_arrs,
-                access_type,
-            )
-
-        self.sycl_queue.submit(
-            self.kernel, kernelargs, self.global_size, self.local_size
-        )
-        self.sycl_queue.wait()
-
-        for ty, val, i_dev_arr, access_type in zip(
-            self.argument_types,
-            args,
-            internal_device_arrs,
-            self.ordered_arg_access_types,
-        ):
-            self._pack_argument(
-                ty, val, self.sycl_queue, i_dev_arr, access_type
-            )
-
-    def _pack_argument(self, ty, val, sycl_queue, device_arr, access_type):
-        """
-        Copy device data back to host
-        """
-        if device_arr and (
-            access_type not in self.valid_access_types
-            or access_type in self.valid_access_types
-            and self.valid_access_types[access_type] != _RO_KERNEL_ARG
-        ):
-            # We copy the data back from usm allocated data
-            # container to original data container.
-            usm_mem, orig_ndarr, packed_ndarr, packed = device_arr
-            copy_to_numpy_from_usm_obj(usm_mem, packed_ndarr)
-            if packed:
-                np.copyto(orig_ndarr, packed_ndarr)
-
-    def _unpack_device_array_argument(
-        self, size, itemsize, buf, shape, strides, ndim, kernelargs
-    ):
-        """
-        Implements the unpacking logic for array arguments.
-
-        Args:
-            size: Total number of elements in the array.
-            itemsize: Size in bytes of each element in the array.
-            buf: The pointer to the memory.
-            shape: The shape of the array.
-            ndim: Number of dimension.
-            kernelargs: Array where the arguments of the kernel is stored.
-        """
-        # meminfo
-        kernelargs.append(ctypes.c_size_t(0))
-        # parent
-        kernelargs.append(ctypes.c_size_t(0))
-        kernelargs.append(ctypes.c_longlong(size))
-        kernelargs.append(ctypes.c_longlong(itemsize))
-        kernelargs.append(buf)
-        for ax in range(ndim):
-            kernelargs.append(ctypes.c_longlong(shape[ax]))
-        for ax in range(ndim):
-            kernelargs.append(ctypes.c_longlong(strides[ax]))
-
-    def _unpack_USMNdArray(self, val, kernelargs):
-        (
-            usm_mem,
-            total_size,
-            shape,
-            ndim,
-            itemsize,
-            strides,
-            dtype,
-        ) = get_info_from_suai(val)
-
-        self._unpack_device_array_argument(
-            total_size,
-            itemsize,
-            usm_mem,
-            shape,
-            strides,
-            ndim,
-            kernelargs,
-        )
-
-    def _unpack_Array(
-        self, val, sycl_queue, kernelargs, device_arrs, access_type
-    ):
-        packed_val = val
-        usm_mem = has_usm_memory(val)
-        if usm_mem is None:
-            default_behavior = self.check_for_invalid_access_type(access_type)
-            usm_mem = as_usm_obj(val, queue=sycl_queue, copy=False)
-
-            orig_val = val
-            packed = False
-            if not val.flags.c_contiguous:
-                # If the numpy.ndarray is not C-contiguous
-                # we pack the strided array into a packed array.
-                # This allows us to treat the data from here on as C-contiguous.
-                # While packing we treat the data as C-contiguous.
-                # We store the reference of both (strided and packed)
-                # array and during unpacking we use numpy.copyto() to copy
-                # the data back from the packed temporary array to the
-                # original strided array.
-                packed_val = val.flatten(order="C")
-                packed = True
-
-            if (
-                default_behavior
-                or self.valid_access_types[access_type] == _RO_KERNEL_ARG
-                or self.valid_access_types[access_type] == _RW_KERNEL_ARG
-            ):
-                copy_from_numpy_to_usm_obj(usm_mem, packed_val)
-
-            device_arrs[-1] = (usm_mem, orig_val, packed_val, packed)
-
-        self._unpack_device_array_argument(
-            packed_val.size,
-            packed_val.dtype.itemsize,
-            usm_mem,
-            packed_val.shape,
-            packed_val.strides,
-            packed_val.ndim,
-            kernelargs,
-        )
-
-    def _unpack_argument(
-        self, ty, val, sycl_queue, kernelargs, device_arrs, access_type
-    ):
-        """
-        Unpacks the arguments that are to be passed to the SYCL kernel from
-        Numba types to Ctypes.
-
-        Args:
-            ty: The data types of the kernel argument defined as in instance of
-                numba.types.
-            val: The value of the kernel argument.
-            sycl_queue (dpctl.SyclQueue): A ``dpctl.SyclQueue`` object. The
-                queue object will be used whenever USM memory allocation is
-                needed during unpacking of an numpy.ndarray argument.
-            kernelargs (list): The list of kernel arguments into which the
-                current kernel argument will be appended.
-            device_arrs (list): A list of tuples that is used to store the
-                triples corresponding to the USM memorry allocated for an
-                ``numpy.ndarray`` argument, a wrapper ``ndarray`` created from
-                the USM memory, and the original ``ndarray`` argument.
-            access_type : The type of access for an array argument.
-
-        Raises:
-            NotImplementedError: If the type of argument is not yet supported,
-                then a ``NotImplementedError`` is raised.
-
-        """
-
-        device_arrs.append(None)
-
-        if isinstance(ty, USMNdArray):
-            self._unpack_USMNdArray(val, kernelargs)
-        elif isinstance(ty, types.Array):
-            self._unpack_Array(
-                val, sycl_queue, kernelargs, device_arrs, access_type
-            )
-        elif ty == types.int64:
-            cval = ctypes.c_longlong(val)
-            kernelargs.append(cval)
-        elif ty == types.uint64:
-            cval = ctypes.c_ulonglong(val)
-            kernelargs.append(cval)
-        elif ty == types.int32:
-            cval = ctypes.c_int(val)
-            kernelargs.append(cval)
-        elif ty == types.uint32:
-            cval = ctypes.c_uint(val)
-            kernelargs.append(cval)
-        elif ty == types.float64:
-            cval = ctypes.c_double(val)
-            kernelargs.append(cval)
-        elif ty == types.float32:
-            cval = ctypes.c_float(val)
-            kernelargs.append(cval)
-        elif ty == types.boolean:
-            cval = ctypes.c_uint8(int(val))
-            kernelargs.append(cval)
-        elif ty == types.complex64:
-            raise NotImplementedError(ty, val)
-        elif ty == types.complex128:
-            raise NotImplementedError(ty, val)
-        else:
-            raise NotImplementedError(ty, val)
-
-    def check_for_invalid_access_type(self, access_type):
-        if access_type not in self.valid_access_types:
-            msg = (
-                "[!] %s is not a valid access type. "
-                "Supported access types are [" % (access_type)
-            )
-            for key in self.valid_access_types:
-                msg += " %s |" % (key)
-
-            msg = msg[:-1] + "]"
-            if access_type is not None:
-                print(msg)
-            return True
-        else:
-            return False
-
-
-class JitKernel(KernelBase):
-    def __init__(self, func, debug, access_types):
-
-        super(JitKernel, self).__init__()
-
-        self.py_func = func
-        self.definitions = {}
-        self.debug = debug
-        self.access_types = access_types
-
-        from .core.descriptor import dpex_target
-
-        self.typingctx = dpex_target.typing_context
-
-    def _get_argtypes(self, *args):
-        """
-        Convenience function to get the type of each argument.
-        """
-        return tuple([self.typingctx.resolve_argument_type(a) for a in args])
-
-    def _datatype_is_same(self, argtypes):
-        """
-        This function will determine if there is any argument of type array and
-        in case there are multiple array types if they are all of the same type.
-
-        Args:
-            argtypes: Numba type for each argument passed to a JitKernel.
-
-        Returns:
-            array_type: None if there are no argument of type array, or the
-                        Numba type in case there is array type argument.
-            bool: True if no array type arguments or if all array type arguments
-                  are of same Numba type, False otherwise.
-
-        """
-        array_type = None
-        for i, argtype in enumerate(argtypes):
-            arg_is_array_type = isinstance(argtype, USMNdArray) or isinstance(
-                argtype, types.Array
-            )
-            if array_type is None and arg_is_array_type:
-                array_type = argtype
-            elif (
-                array_type is not None
-                and arg_is_array_type
-                and type(argtype) is not type(array_type)
-            ):
-                return None, False
-        return array_type, True
-
-    def __call__(self, *args, **kwargs):
-        assert not kwargs, "Keyword Arguments are not supported"
-
-        argtypes = self._get_argtypes(*args)
-        compute_queue = None
-
-        # Get the array type and whether all array are of same type or not
-        array_type, uniform = self._datatype_is_same(argtypes)
-        if not uniform:
-            _raise_datatype_mixed_error(argtypes)
-
-        if type(array_type) == USMNdArray:
-            if dpctl.is_in_device_context():
-                warnings.warn(cfd_ctx_mgr_wrng_msg)
-
-            queues = []
-            for i, argtype in enumerate(argtypes):
-                if type(argtype) == USMNdArray:
-                    memory = dpctl.memory.as_usm_memory(args[i])
-                    if dpctl_version < (0, 12):
-                        queue = memory._queue
-                    else:
-                        queue = memory.sycl_queue
-                    queues.append(queue)
-
-            # dpctl.utils.get_exeuction_queue() checks if the queues passed are
-            # equivalent and returns a SYCL queue if they are equivalent and
-            # None if they are not.
-            compute_queue = dpctl.utils.get_execution_queue(queues)
-            if compute_queue is None:
-                raise IndeterminateExecutionQueueError(
-                    "Data passed as argument are not equivalent. Please "
-                    "create dpctl.tensor.usm_ndarray with equivalent SYCL queue."
-                )
-
-        if compute_queue is None:
-            try:
-                compute_queue = dpctl.get_current_queue()
-            except:
-                _raise_no_device_found_error()
-
-        kernel = self.specialize(argtypes, compute_queue)
-        cfg = kernel.configure(
-            kernel.sycl_queue, self.global_size, self.local_size
-        )
-        cfg(*args)
-
-    def specialize(self, argtypes, queue):
-        # We specialize for argtypes and queue. These two are used as key for
-        # caching as well.
-        assert queue is not None
-
-        sycl_ctx = None
-        kernel = None
-        # we were previously using the _env_ptr of the device_env, the sycl_queue
-        # should be sufficient to cache the compiled kernel for now, but we should
-        # use the device type to cache such kernels.
-        key_definitions = argtypes
-        result = self.definitions.get(key_definitions)
-        if result:
-            sycl_ctx, kernel = result
-
-        if sycl_ctx and sycl_ctx == queue.sycl_context:
-            return kernel
-        else:
-            kernel = compile_kernel(
-                queue, self.py_func, argtypes, self.access_types, self.debug
-            )
-            self.definitions[key_definitions] = (queue.sycl_context, kernel)
-        return kernel
diff --git a/numba_dpex/tests/test_no_copy_usm_shared.py b/numba_dpex/tests/test_no_copy_usm_shared.py
index e4ced748c8..3d0ab97539 100644
--- a/numba_dpex/tests/test_no_copy_usm_shared.py
+++ b/numba_dpex/tests/test_no_copy_usm_shared.py
@@ -10,7 +10,7 @@
 from numba.core import compiler, cpu
 from numba.core.registry import cpu_target
 
-from numba_dpex.compiler import Compiler
+from numba_dpex.core.compiler import Compiler
 from numba_dpex.tests._helper import skip_no_opencl_gpu
 
 

From 1d3a5291d3b5084a78a2ce31a0ec747b09d9ed95 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Mon, 9 Jan 2023 23:40:04 -0600
Subject: [PATCH 38/51] Remove the temporary driver.py file.

---
 driver.py | 39 ---------------------------------------
 1 file changed, 39 deletions(-)
 delete mode 100644 driver.py

diff --git a/driver.py b/driver.py
deleted file mode 100644
index d31024fdaa..0000000000
--- a/driver.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#! /usr/bin/env python
-
-# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-import dpctl.tensor as dpt
-
-import numba_dpex as dpex
-from numba_dpex import int64, usm_ndarray
-from numba_dpex.core.kernel_interface.dispatcher import JitKernel
-
-arrty = usm_ndarray(int64, 1, "C", "device", "level_zero:gpu:0")
-
-
-@dpex.kernel((arrty, arrty, arrty))
-def data_parallel_sum(a, b, c):
-    """
-    Vector addition using the ``kernel`` decorator.
-    """
-    i = dpex.get_global_id(0)
-    c[i] = a[i] + b[i]
-
-
-def main():
-    a = dpt.arange(0, 100, device="level_zero:gpu:0")
-    b = dpt.arange(0, 100, device="level_zero:gpu:0")
-    c = dpt.zeros_like(a, device="level_zero:gpu:0")
-
-    # d = Dispatcher(pyfunc=data_parallel_sum)
-    # d(a, b, c, global_range=[100])
-    data_parallel_sum[(100,)](a, b, c)
-    print(dpt.asnumpy(a))
-    print(dpt.asnumpy(b))
-    print(dpt.asnumpy(c))
-    print("Done...")
-
-
-if __name__ == "__main__":
-    main()

From 020da9eeed28253bef096f8a17f88792aadd3489 Mon Sep 17 00:00:00 2001
From: "Wang, Mingjie1" <mingjie1.wang@intel.com>
Date: Mon, 9 Jan 2023 14:10:13 -0600
Subject: [PATCH 39/51] Added ndarray setup check.

---
 numba_dpex/core/exceptions.py                 | 27 ++++++++++++
 .../core/kernel_interface/dispatcher.py       | 42 +++++++++++++++----
 2 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py
index 558fd141b0..3aefd4f489 100644
--- a/numba_dpex/core/exceptions.py
+++ b/numba_dpex/core/exceptions.py
@@ -108,6 +108,16 @@ def __init__(self, kernel_name, ndims, max_work_item_dims) -> None:
         super().__init__(self.message)
 
 
+class UnmatchedNumberOfRangeDimsError(Exception):
+    def __init__(self, kernel_name, global_ndims, local_ndims) -> None:
+        self.message = (
+            f"Specified global_range for kernel {kernel_name} has {global_ndims} dimensions, "
+            f"while specified local_range with dimenstions of {local_ndims} doesn't match "
+            "with global_range."
+        )
+        super().__init__(self.message)
+
+
 class UnsupportedWorkItemSizeError(Exception):
     """
 
@@ -127,6 +137,23 @@ def __init__(
         super().__init__(self.message)
 
 
+class UnsupportedGroupWorkItemSizeError(Exception):
+    """
+
+    Args:
+        Exception (_type_): _description_
+    """
+
+    def __init__(self, kernel_name, dim, work_groups, work_items) -> None:
+        self.message = (
+            f"Attempting to launch kernel {kernel_name} with "
+            f"{work_groups} global work groups and {work_items} local work items "
+            f"in dimension {dim} is not supported. The global work groups must be "
+            f"able to divide local work items evenly."
+        )
+        super().__init__(self.message)
+
+
 class ComputeFollowsDataInferenceError(Exception):
     """Exception raised when an execution queue for a given array expression or
     a kernel function could not be deduced using the compute-follows-data
diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index bb91dcd399..d9db893e20 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -25,7 +25,9 @@
     KernelHasReturnValueError,
     MissingSpecializationError,
     UnknownGlobalRangeError,
+    UnmatchedNumberOfRangeDimsError,
     UnsupportedBackendError,
+    UnsupportedGroupWorkItemSizeError,
     UnsupportedNumberOfRangeDimsError,
     UnsupportedWorkItemSizeError,
 )
@@ -227,6 +229,16 @@ def _specialize(self, sig):
             cache=self._specialization_cache,
         )
 
+    def _check_size(self, dim, size, size_limit):
+
+        if size > size_limit:
+            raise UnsupportedWorkItemSizeError(
+                kernel_name=self.kernel_name,
+                dim=dim,
+                requested_work_items=size,
+                supported_work_items=size_limit,
+            )
+
     def _check_range(self, range, device):
 
         if not isinstance(range, (tuple, list)):
@@ -242,15 +254,26 @@ def _check_range(self, range, device):
             )
 
     def _check_ndrange(self, global_range, local_range, device):
-        # for dim, size in enumerate(val):
-        #     if val[dim] > work_item_sizes[dim]:
-        #         raise UnsupportedWorkItemSizeError(
-        #             kernel_name=self.kernel_name,
-        #             dim=dim,
-        #             requested_work_items=val[dim],
-        #             supported_work_items=work_item_sizes[dim],
-        #         )
-        pass
+
+        self._check_range(local_range, device)
+
+        self._check_range(global_range, device)
+        if len(local_range) != len(global_range):
+            raise UnmatchedNumberOfRangeDimsError(
+                kernel_name=self.kernel_name,
+                global_ndims=len(global_range),
+                local_ndims=len(local_range),
+            )
+
+        for i in range(len(global_range)):
+            self._check_size(i, local_range[i], device.max_work_item_sizes[i])
+            if global_range[i] % local_range[i] != 0:
+                raise UnsupportedGroupWorkItemSizeError(
+                    kernel_name=self.kernel_name,
+                    dim=i,
+                    work_groups=global_range[i],
+                    work_items=local_range[i],
+                )
 
     def _chk_compute_follows_data_compliance(self, usm_array_arglist):
         """Check if all the usm ndarray's have the same device.
@@ -536,6 +559,7 @@ def _get_ranges(self, global_range, local_range, device):
         # N is one, two or three.
         # If both local and global range values are specified the kernel is
         # invoked using a SYCL nd_range
+
         if global_range and not local_range:
             self._check_range(global_range, device)
             # FIXME:[::-1] is done as OpenCL and SYCl have different orders when

From 12fa8a3d087e775fc372e1de2d60ff1a7fb6a3f2 Mon Sep 17 00:00:00 2001
From: "Wang, Mingjie1" <mingjie1.wang@intel.com>
Date: Mon, 9 Jan 2023 20:14:44 -0600
Subject: [PATCH 40/51] Added tests for ndrange exceptions.

---
 .../kernel_tests/test_ndrange_exceptions.py   | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py

diff --git a/numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py b/numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py
new file mode 100644
index 0000000000..90c3aa4499
--- /dev/null
+++ b/numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py
@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+import dpnp
+import pytest
+
+import numba_dpex as ndpx
+from numba_dpex.core.exceptions import (
+    UnmatchedNumberOfRangeDimsError,
+    UnsupportedGroupWorkItemSizeError,
+)
+
+
+# Data parallel kernel implementing vector sum
+@ndpx.kernel
+def kernel_vector_sum(a, b, c):
+    i = ndpx.get_global_id(0)
+    c[i] = a[i] + b[i]
+
+
+@pytest.mark.parametrize(
+    "error, ndrange",
+    [
+        (UnmatchedNumberOfRangeDimsError, ((2, 2), (1, 1, 1))),
+        (UnsupportedGroupWorkItemSizeError, ((3, 3, 3), (2, 2, 2))),
+    ],
+)
+def test_ndrange_config_error(error, ndrange):
+    """Test if a exception is raised when calling a
+    ndrange kernel with unspported arguments.
+    """
+    N = 10
+
+    a = dpnp.random.random(N)
+    b = dpnp.random.random(N)
+    c = dpnp.ones_like(a)
+
+    with pytest.raises(error):
+        kernel_vector_sum[ndrange](a, b, c)

From f341ab26c52bb6ae6bf963b0b05ab738fb26e829 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 10 Jan 2023 00:07:47 -0600
Subject: [PATCH 41/51] Add an example for aot kernel specialization.

---
 .../examples/kernel/aot_specialization.py     | 134 ++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 numba_dpex/examples/kernel/aot_specialization.py

diff --git a/numba_dpex/examples/kernel/aot_specialization.py b/numba_dpex/examples/kernel/aot_specialization.py
new file mode 100644
index 0000000000..0c4ab71ad3
--- /dev/null
+++ b/numba_dpex/examples/kernel/aot_specialization.py
@@ -0,0 +1,134 @@
+# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import dpctl.tensor as dpt
+import numpy as np
+import pytest
+
+import numba_dpex as dpex
+from numba_dpex import float32, int64, usm_ndarray
+from numba_dpex.core.exceptions import (
+    InvalidKernelSpecializationError,
+    MissingSpecializationError,
+)
+
+# Similar to Numba, numba-dpex supports ahead-of-time (AOT) compilation of
+# functions. The following examples demonstrate the feature for
+# numba_dpex.kernel and presents usage scenarios and current limitations.
+
+# ------------                 AOT Example 1.                   ------------ #
+
+# Define type specializations using the numba_dpex usm_ndarray data type.
+i64arrty = usm_ndarray(int64, 1, "C", usm_type="device", device="0")
+f32arrty = usm_ndarray(float32, 1, "C", usm_type="device", device="0")
+
+
+# specialize a kernel for the i64arrty
+@dpex.kernel((i64arrty, i64arrty, i64arrty))
+def data_parallel_sum(a, b, c):
+    """
+    Vector addition using the ``kernel`` decorator.
+    """
+    i = dpex.get_global_id(0)
+    c[i] = a[i] + b[i]
+
+
+# run the specialized kernel
+a = dpt.ones(1024, dtype=dpt.int64, device="0")
+b = dpt.ones(1024, dtype=dpt.int64, device="0")
+c = dpt.zeros(1024, dtype=dpt.int64, device="0")
+
+data_parallel_sum[
+    1024,
+](a, b, c)
+
+npc = dpt.asnumpy(c)
+npc_expected = np.full(1024, 2, dtype=np.int64)
+assert np.array_equal(npc, npc_expected)
+
+
+# ------------                 AOT Example 2.                   ------------ #
+
+# Multiple signatures can be specified as a list to AOT compile multiple
+# versions of the kernel.
+
+# specialize a kernel for the i64arrty
+@dpex.kernel([(i64arrty, i64arrty, i64arrty), (f32arrty, f32arrty, f32arrty)])
+def data_parallel_sum2(a, b, c):
+    """
+    Vector addition using the ``kernel`` decorator.
+    """
+    i = dpex.get_global_id(0)
+    c[i] = a[i] + b[i]
+
+
+# run the i64 specialized kernel
+a = dpt.ones(1024, dtype=dpt.int64, device="0")
+b = dpt.ones(1024, dtype=dpt.int64, device="0")
+c = dpt.zeros(1024, dtype=dpt.int64, device="0")
+
+data_parallel_sum2[
+    1024,
+](a, b, c)
+
+npc = dpt.asnumpy(c)
+npc_expected = np.full(1024, 2, dtype=np.int64)
+assert np.array_equal(npc, npc_expected)
+
+# run the f32 specialized kernel
+a = dpt.ones(1024, dtype=dpt.float32, device="0")
+b = dpt.ones(1024, dtype=dpt.float32, device="0")
+c = dpt.zeros(1024, dtype=dpt.float32, device="0")
+
+data_parallel_sum2[
+    1024,
+](a, b, c)
+
+npc = dpt.asnumpy(c)
+npc_expected = np.full(1024, 2, dtype=np.float32)
+assert np.array_equal(npc, npc_expected)
+
+
+# ------------                 AOT Example 3.                   ------------ #
+
+# AOT specialized kernels cannot be jit compiled. Calling a specialized kernel
+# with arguments having type different from the specialization will result in
+# an MissingSpecializationError.
+
+a = dpt.ones(1024, dtype=dpt.int32)
+b = dpt.ones(1024, dtype=dpt.int32)
+c = dpt.zeros(1024, dtype=dpt.int32)
+
+try:
+    data_parallel_sum[
+        1024,
+    ](a, b, c)
+except MissingSpecializationError as mse:
+    print(mse)
+
+
+# ------------                 AOT Example 4.                   ------------ #
+
+# Numba_dpex does not support NumPy arrays as kernel arguments and all
+# array arguments should be inferable as a numba_dpex.types.usm_ndarray. Trying
+# to AOT with a NumPy array-based signature will lead to an
+# InvalidKernelSpecializationError
+
+try:
+    dpex.kernel((int64[::1], int64[::1], int64[::1]))
+except InvalidKernelSpecializationError as ikse:
+    print(ikse)
+
+
+# ------------                 Limitations                       ------------ #
+
+
+# Specifying signatures using strings is not yet supported. The limitation is
+# due to numba_dpex relying on Numba's sigutils module to parse signatures.
+# Sigutils only recognizes Numba types specified as strings.
+
+try:
+    dpex.kernel("(i64arrty, i64arrty, i64arrty)")
+except NotImplementedError as nie:
+    print(nie)

From d275e1d3923ad2f7acd1b18663c8c8a25c21d21e Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 10 Jan 2023 00:14:40 -0600
Subject: [PATCH 42/51] Fix deprecation warnings.

---
 numba_dpex/core/kernel_interface/dispatcher.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index d9db893e20..3e49a25cbb 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -471,13 +471,13 @@ def __getitem__(self, args):
             global_range and local_range attributes initialized.
 
         .. deprecated:: 0.19
-            Use :func:`KernelLauncher.execute` instead.
         """
 
         warn(
             "The [] (__getitem__) method to set global and local ranges for "
             + "launching a kernel is deprecated. "
-            + "Use the execute function instead.",
+            + 'Set the "global_range" and the "local_range" keyword '
+            + "arguments when calling the kernel instead.",
             DeprecationWarning,
             stacklevel=2,
         )
@@ -518,8 +518,8 @@ def _get_ranges(self, global_range, local_range, device):
             if self._global_range:
                 warn(
                     "Use of __getitem__ to set the global_range attribute is "
-                    + 'deprecated. Use the keyword argument "global_range" of '
-                    + "__call__ method to set the attribute."
+                    + 'deprecated. Use the keyword argument "global_range" '
+                    + "when calling the kernel to specify the global range."
                 )
                 global_range = self._global_range
             else:
@@ -535,8 +535,8 @@ def _get_ranges(self, global_range, local_range, device):
             if self._local_range:
                 warn(
                     "Use of __getitem__ to set the local_range attribute is "
-                    + 'deprecated. Use the keyword argument "local_range" of '
-                    + "__call__ method to set the attribute."
+                    + 'deprecated. Use the keyword argument "local_range" '
+                    + "when calling the kernel to specify the local range."
                 )
                 local_range = self._local_range
             else:
@@ -548,7 +548,7 @@ def _get_ranges(self, global_range, local_range, device):
                     + "Consider setting the local range value for the kernel "
                     + "execution.\n"
                     + "The local_range keyword may be made a required argument "
-                    + "in the future."
+                    + "in the future when calling a kernel."
                 )
 
         if isinstance(global_range, int):

From 8150a96acc4845ffc3868d6257cf896cd7cc287d Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Wed, 11 Jan 2023 13:12:08 -0600
Subject: [PATCH 43/51] Add docsstrings.

---
 numba_dpex/core/kernel_interface/__init__.py  |  2 +-
 .../kernel_interface/arg_pack_unpacker.py     | 31 +++++++----
 .../core/kernel_interface/dispatcher.py       | 52 ++++++++++++++-----
 numba_dpex/core/utils/suai_helper.py          |  7 +--
 .../examples/kernel/aot_specialization.py     | 11 ++--
 5 files changed, 67 insertions(+), 36 deletions(-)

diff --git a/numba_dpex/core/kernel_interface/__init__.py b/numba_dpex/core/kernel_interface/__init__.py
index 5557024c06..21f8040397 100644
--- a/numba_dpex/core/kernel_interface/__init__.py
+++ b/numba_dpex/core/kernel_interface/__init__.py
@@ -2,5 +2,5 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""Defines the Kernel classes that abstract a SYCL device kernel.
+"""Defines the interface for kernel compilation using numba-dpex.
 """
diff --git a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
index f734b02f97..2e6d13bb3f 100644
--- a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
+++ b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py
@@ -11,12 +11,11 @@
 
 import numba_dpex.utils as utils
 from numba_dpex.core.exceptions import (
-    SUAIProtocolError,
     UnsupportedAccessQualifierError,
     UnsupportedKernelArgumentError,
 )
 from numba_dpex.core.types import USMNdArray
-from numba_dpex.core.utils import SyclUSMArrayInterface, get_info_from_suai
+from numba_dpex.core.utils import get_info_from_suai
 
 
 class _NumPyArrayPackerPayload:
@@ -28,6 +27,9 @@ def __init__(self, usm_mem, orig_val, packed_val, packed) -> None:
 
 
 class Packer:
+    """Implements the functionality to unpack a Python object passed as an
+    argument to a numba_dpex kernel fucntion into corresponding ctype object.
+    """
 
     # TODO: Remove after NumPy support is removed
     _access_types = ("read_only", "write_only", "read_write")
@@ -45,8 +47,6 @@ def _unpack_array_helper(self, size, itemsize, buf, shape, strides, ndim):
         """
         Implements the unpacking logic for array arguments.
 
-        TODO: Add more detail
-
         Args:
             size: Total number of elements in the array.
             itemsize: Size in bytes of each element in the array.
@@ -83,7 +83,7 @@ def _unpack_usm_array(self, val):
             val : An object of dpctl.types.UsmNdArray type.
 
         Returns:
-            _type_: _description_
+            list: A list of ctype objects representing the flattened usm_ndarray
         """
         suai_attrs = get_info_from_suai(val)
 
@@ -160,7 +160,7 @@ def _unpack_array(self, val, access_type):
 
     def _unpack_argument(self, ty, val, access_specifier):
         """
-        Unpack a Python object into a ctype value using Numba's
+        Unpack a Python object into one or more ctype values using Numba's
         type-inference machinery.
 
         Args:
@@ -201,7 +201,8 @@ def _unpack_argument(self, ty, val, access_specifier):
 
     def _pack_array(self):
         """
-        Copy device data back to host
+        Deprecated to be removed once NumPy array support in kernels is
+        removed.
         """
         for obj in self._repack_list:
             utils.copy_to_numpy_from_usm_obj(obj._usm_mem, obj._packed_val)
@@ -211,12 +212,18 @@ def _pack_array(self):
     def __init__(
         self, kernel_name, arg_list, argty_list, access_specifiers_list, queue
     ) -> None:
-        """_summary_
+        """Initializes new Packer object and unpacks the input argument list.
 
         Args:
-            arg_list (_type_): _description_
-            argty_list (_type_): _description_
-            queue: _description_
+            arg_list (list): A list of arguments to be unpacked
+            argty_list (list): A list of Numba inferred types for each argument.
+            access_specifiers_list(list): A list of access specifiers for
+            NumPy arrays to optimize host to device memory copy.
+            [Deprecated: can be removed along with NumPy array support]
+            queue (dpctl.SyclQueue): The SYCL queue where the kernel is to be
+            executed. The queue is required to allocate USM memory for NumPy
+            arrays.
+            [Deprecated: can be removed along with NumPy array support]
         """
         self._pyfunc_name = kernel_name
         self._arg_list = arg_list
@@ -241,9 +248,11 @@ def __init__(
 
     @property
     def unpacked_args(self):
+        """Returns the list of unpacked arguments created by a Packer object."""
         return self._unpacked_args
 
     @property
     def repacked_args(self):
+        """Returns the list of NumPy"""
         self._pack_array()
         return self._repack_list
diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index 3e49a25cbb..808910e35a 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -37,6 +37,7 @@
 
 
 def get_ordered_arg_access_types(pyfunc, access_types):
+    """Deprecated and to be removed in next release."""
     # Construct a list of access type of each arg according to their position
     ordered_arg_access_types = []
     sig = signature(pyfunc, follow_wrapped=False)
@@ -52,7 +53,8 @@ def get_ordered_arg_access_types(pyfunc, access_types):
 
 
 class JitKernel:
-    """An abstract function object wrapping a concrete device kernel function.
+    """Functor to wrap a kernel function and JIT compile and dispatch it to a
+    specified SYCL queue.
 
     A JitKernel is returned by the kernel decorator and wraps an instance of a
     device kernel function. A device kernel function is specialized for a
@@ -140,6 +142,9 @@ def cache_hits(self):
         return self._cache_hits
 
     def _compile_and_cache(self, argtypes, backend, device_type, cache):
+        """Helper function to compile the Python function or Numba FunctionIR
+        object passed to a JitKernel and store it in an internal cache.
+        """
         # We always compile the kernel using the dpex_target.
         typingctx = dpex_target.typing_context
         targetctx = dpex_target.target_context
@@ -168,10 +173,10 @@ def _compile_and_cache(self, argtypes, backend, device_type, cache):
         return device_driver_ir_module, kernel_module_name
 
     def _specialize(self, sig):
-        """Compiles a device kernel ahead of time based on provided argtypes.
+        """Compiles a device kernel ahead of time based on provided signature.
 
         Args:
-            sig (_type_): _description_
+            sig: The signature on which the kernel is to be specialized.
         """
 
         argtypes, return_type = sigutils.normalize_signature(sig)
@@ -230,6 +235,9 @@ def _specialize(self, sig):
         )
 
     def _check_size(self, dim, size, size_limit):
+        """Checks if the range value is sane based on the number of work items
+        supported by the device.
+        """
 
         if size > size_limit:
             raise UnsupportedWorkItemSizeError(
@@ -240,6 +248,11 @@ def _check_size(self, dim, size, size_limit):
             )
 
     def _check_range(self, range, device):
+        """Checks if the requested range to launch the kernel is valid.
+
+        Range is checked against the number of dimensions and if the range
+        argument is specified as a valid list of tuple.
+        """
 
         if not isinstance(range, (tuple, list)):
             raise IllegalRangeValueError(self.kernel_name)
@@ -254,7 +267,9 @@ def _check_range(self, range, device):
             )
 
     def _check_ndrange(self, global_range, local_range, device):
-
+        """Checks if the specified nd_range (global_range, local_range) is
+        legal for a device on which the kernel will be launched.
+        """
         self._check_range(local_range, device)
 
         self._check_range(global_range, device)
@@ -428,8 +443,8 @@ def _determine_kernel_launch_queue(self, args, argtypes):
                 raise ComputeFollowsDataInferenceError(
                     self.kernel_name, usmarray_argnum_list=usmarray_argnums
                 )
-            else:
-                return dpctl.SyclQueue(device)
+
+            return dpctl.SyclQueue(device)
         else:
             if dpctl.is_in_device_context():
                 warn(
@@ -499,14 +514,24 @@ def __getitem__(self, args):
         return copy.copy(self)
 
     def _get_ranges(self, global_range, local_range, device):
-        """_summary_
+        """Helper to get the global and local range values needed to launch a
+        kernel.
+
+        The global and local range arguments can either be provided using the
+        __getitem__ method or as keyword arguments to the __call__ method.
+        The function verifies that the range values are specified using at least
+        one of the method.
 
         Args:
-            global_range (_type_): _description_
-            local_range (_type_): _description_
+            global_range (list or tuple): The global range to be used for kernel
+            launch.
+            local_range (list or tuple): The local range to be used for kernel
+            launch.
+            device (dpctl.SyclDevice): The device on which to launch the kernel.
 
         Raises:
-            UnknownGlobalRangeError: _description_
+            UnknownGlobalRangeError: When no global range was specified for
+            kernel launch.
         """
         if global_range:
             if self._global_range:
@@ -579,11 +604,12 @@ def _get_ranges(self, global_range, local_range, device):
         return (global_range, local_range)
 
     def __call__(self, *args, global_range=None, local_range=None):
-        """_summary_
+        """Functor to launch a kernel.
 
         Args:
-            global_range (_type_): _description_
-            local_range (_type_): _description_.
+            global_range (list or tuple): optional global range for kernel
+            launch.
+            local_range (list or tuple): optional local range for kernel launch.
         """
         argtypes = [self.typingctx.resolve_argument_type(arg) for arg in args]
         # FIXME: For specialized and ahead of time compiled and cached kernels,
diff --git a/numba_dpex/core/utils/suai_helper.py b/numba_dpex/core/utils/suai_helper.py
index ea52bcc0dc..101f1f75c4 100644
--- a/numba_dpex/core/utils/suai_helper.py
+++ b/numba_dpex/core/utils/suai_helper.py
@@ -128,12 +128,7 @@ def get_info_from_suai(obj):
             strides[i - 1] = strides[i] * shape[i]
         strides = tuple(strides)
 
-    syclobj = usm_mem.__sycl_usm_array_interface__["syclobj"]
-    if not isinstance(syclobj, dpctl.SyclQueue):
-        raise ValueError(
-            "dpctl.SyclQueue could not be inferred. "
-            "The __sycl_usm_array_interface__ may be malformed."
-        )
+    syclobj = usm_mem.sycl_queue
     device = syclobj.sycl_device.filter_string
     usm_type = usm_mem.get_usm_type()
 
diff --git a/numba_dpex/examples/kernel/aot_specialization.py b/numba_dpex/examples/kernel/aot_specialization.py
index 0c4ab71ad3..ce0044c088 100644
--- a/numba_dpex/examples/kernel/aot_specialization.py
+++ b/numba_dpex/examples/kernel/aot_specialization.py
@@ -2,9 +2,10 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import logging
+
 import dpctl.tensor as dpt
 import numpy as np
-import pytest
 
 import numba_dpex as dpex
 from numba_dpex import float32, int64, usm_ndarray
@@ -117,8 +118,8 @@ def data_parallel_sum2(a, b, c):
 
 try:
     dpex.kernel((int64[::1], int64[::1], int64[::1]))
-except InvalidKernelSpecializationError as ikse:
-    print(ikse)
+except InvalidKernelSpecializationError:
+    logging.exception()
 
 
 # ------------                 Limitations                       ------------ #
@@ -130,5 +131,5 @@ def data_parallel_sum2(a, b, c):
 
 try:
     dpex.kernel("(i64arrty, i64arrty, i64arrty)")
-except NotImplementedError as nie:
-    print(nie)
+except NotImplementedError:
+    logging.exception()

From 6ef8212ff186d4160b06514f375b1ac9506826b8 Mon Sep 17 00:00:00 2001
From: "Wang, Mingjie1" <mingjie1.wang@intel.com>
Date: Tue, 10 Jan 2023 13:41:49 -0600
Subject: [PATCH 44/51] Switched to dpctl.tensor in test_ndrange_exceptions.py.

---
 numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py b/numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py
index 90c3aa4499..e92fdbea28 100644
--- a/numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py
+++ b/numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
 #
 # SPDX-License-Identifier: Apache-2.0
-import dpnp
+import dpctl.tensor as dpt
 import pytest
 
 import numba_dpex as ndpx
@@ -29,11 +29,10 @@ def test_ndrange_config_error(error, ndrange):
     """Test if a exception is raised when calling a
     ndrange kernel with unspported arguments.
     """
-    N = 10
 
-    a = dpnp.random.random(N)
-    b = dpnp.random.random(N)
-    c = dpnp.ones_like(a)
+    a = dpt.ones(1024, dtype=dpt.int32, device="0")
+    b = dpt.ones(1024, dtype=dpt.int32, device="0")
+    c = dpt.zeros(1024, dtype=dpt.int64, device="0")
 
     with pytest.raises(error):
         kernel_vector_sum[ndrange](a, b, c)

From 40d85fb300bd0575cc287ddf6a8627b4fcfc4d5a Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Wed, 11 Jan 2023 17:45:18 -0600
Subject: [PATCH 45/51] Changes to compute-follows-data implementation and
 specialization.

   - The compute follows data checking is now based on queue
     equality.
   - USMNdArray no longer requires usm_type and device
     during construction. It allows us to specialize an usm_ndarray
     only on ndims, layout and dtype.
   - No check for compute follows data for eager compilation.
   - Change caching to not require backend and device-type.
   - Fixes to test cases.
---
 numba_dpex/core/caching.py                    | 10 ++++
 .../core/kernel_interface/dispatcher.py       | 49 ++++---------------
 numba_dpex/core/typeconv/array_conversion.py  |  1 +
 numba_dpex/core/types/usm_ndarray_type.py     | 14 ++++--
 numba_dpex/core/typing/typeof.py              |  1 +
 numba_dpex/core/utils/suai_helper.py          |  7 +++
 numba_dpex/examples/kernel/device_func.py     |  3 +-
 numba_dpex/tests/kernel_tests/test_barrier.py |  8 +--
 .../test_kernel_has_return_value_error.py     |  2 +-
 .../test_kernel_specialization.py             | 10 ++--
 10 files changed, 51 insertions(+), 54 deletions(-)

diff --git a/numba_dpex/core/caching.py b/numba_dpex/core/caching.py
index 3d011787c3..1c46485c6c 100644
--- a/numba_dpex/core/caching.py
+++ b/numba_dpex/core/caching.py
@@ -10,6 +10,7 @@
 from numba.core.serialize import dumps
 
 from numba_dpex import config
+from numba_dpex.core.types import USMNdArray
 
 
 def build_key(argtypes, pyfunc, codegen, backend=None, device_type=None):
@@ -49,6 +50,15 @@ def build_key(argtypes, pyfunc, codegen, backend=None, device_type=None):
     else:
         cvarbytes = b""
 
+    argtylist = list(argtypes)
+    for i, argty in enumerate(argtylist):
+        if isinstance(argty, USMNdArray):
+            # Convert the USMNdArray to an abridged type that disregards the
+            # usm_type, device, queue, address space attributes.
+            argtylist[i] = (argty.ndim, argty.dtype, argty.layout)
+
+    argtypes = tuple(argtylist)
+
     return (
         argtypes,
         codegen.magic_tuple(),
diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index 808910e35a..8fc00e65c3 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -141,7 +141,7 @@ def cache(self):
     def cache_hits(self):
         return self._cache_hits
 
-    def _compile_and_cache(self, argtypes, backend, device_type, cache):
+    def _compile_and_cache(self, argtypes, cache):
         """Helper function to compile the Python function or Numba FunctionIR
         object passed to a JitKernel and store it in an internal cache.
         """
@@ -165,8 +165,6 @@ def _compile_and_cache(self, argtypes, backend, device_type, cache):
             tuple(argtypes),
             self.pyfunc,
             kernel.target_context.codegen(),
-            backend=backend,
-            device_type=device_type,
         )
         cache.put(key, (device_driver_ir_module, kernel_module_name))
 
@@ -212,25 +210,8 @@ def _specialize(self, sig):
                 unsupported_argnum_list=unsupported_argnum_list,
             )
 
-        # CFD check and get the execution queue
-        device = self._chk_compute_follows_data_compliance(usmndarray_argtypes)
-        if not device:
-            raise ComputeFollowsDataInferenceError(
-                self.kernel_name, usmarray_argnum_list=usmarray_argnums
-            )
-
-        if device.backend not in [
-            dpctl.backend_type.opencl,
-            dpctl.backend_type.level_zero,
-        ]:
-            raise UnsupportedBackendError(
-                self.kernel_name, device.backend, JitKernel._supported_backends
-            )
-        # compile and cache the kernel
         self._compile_and_cache(
             argtypes=argtypes,
-            backend=device.backend,
-            device_type=device.device_type,
             cache=self._specialization_cache,
         )
 
@@ -310,22 +291,17 @@ def _chk_compute_follows_data_compliance(self, usm_array_arglist):
             else None is returned.
         """
 
-        device = None
+        queue = None
 
         for usm_array in usm_array_arglist:
-            filter_str = usm_array.device
-            try:
-                _device = dpctl.SyclDevice(filter_str)
-            except Exception as e:
-                print(e)
-                return None
-            if not device:
-                device = _device
+            _queue = usm_array.queue
+            if not queue:
+                queue = _queue
             else:
-                if _device != device:
+                if _queue != queue:
                     return None
 
-        return device
+        return queue
 
     def _determine_kernel_launch_queue(self, args, argtypes):
         """Determines the queue where the kernel is to be launched.
@@ -437,14 +413,14 @@ def _determine_kernel_launch_queue(self, args, argtypes):
                 if i in usmarray_argnums
             ]
 
-            device = self._chk_compute_follows_data_compliance(usm_array_args)
+            queue = self._chk_compute_follows_data_compliance(usm_array_args)
 
-            if not device:
+            if not queue:
                 raise ComputeFollowsDataInferenceError(
                     self.kernel_name, usmarray_argnum_list=usmarray_argnums
                 )
 
-            return dpctl.SyclQueue(device)
+            return queue
         else:
             if dpctl.is_in_device_context():
                 warn(
@@ -617,7 +593,6 @@ def __call__(self, *args, global_range=None, local_range=None):
         # redundant. We should avoid these checks for the specialized case.
         exec_queue = self._determine_kernel_launch_queue(args, argtypes)
         backend = exec_queue.backend
-        device_type = exec_queue.sycl_device.device_type
 
         if exec_queue.backend not in [
             dpctl.backend_type.opencl,
@@ -637,8 +612,6 @@ def __call__(self, *args, global_range=None, local_range=None):
             tuple(argtypes),
             self.pyfunc,
             dpex_target.target_context.codegen(),
-            backend=backend,
-            device_type=device_type,
         )
 
         # If the JitKernel was specialized then raise exception if argtypes
@@ -661,8 +634,6 @@ def __call__(self, *args, global_range=None, local_range=None):
                     kernel_module_name,
                 ) = self._compile_and_cache(
                     argtypes=argtypes,
-                    backend=backend,
-                    device_type=device_type,
                     cache=self._cache,
                 )
 
diff --git a/numba_dpex/core/typeconv/array_conversion.py b/numba_dpex/core/typeconv/array_conversion.py
index 6e949c7349..5096045a90 100644
--- a/numba_dpex/core/typeconv/array_conversion.py
+++ b/numba_dpex/core/typeconv/array_conversion.py
@@ -38,6 +38,7 @@ def to_usm_ndarray(suai_attrs, addrspace=address_space.GLOBAL):
         layout=layout,
         usm_type=suai_attrs.usm_type,
         device=suai_attrs.device,
+        queue=suai_attrs.queue,
         readonly=not suai_attrs.is_writable,
         name=None,
         aligned=True,
diff --git a/numba_dpex/core/types/usm_ndarray_type.py b/numba_dpex/core/types/usm_ndarray_type.py
index 0e4ba2a851..b2b8cdd24b 100644
--- a/numba_dpex/core/types/usm_ndarray_type.py
+++ b/numba_dpex/core/types/usm_ndarray_type.py
@@ -21,8 +21,9 @@ def __init__(
         dtype,
         ndim,
         layout,
-        usm_type,
-        device,
+        usm_type="unknown",
+        device="unknown",
+        queue=None,
         readonly=False,
         name=None,
         aligned=True,
@@ -33,8 +34,13 @@ def __init__(
 
         # Normalize the device filter string and get the fully qualified three
         # tuple (backend:device_type:device_num) filter string from dpctl.
-        _d = dpctl.SyclDevice(device)
-        self.device = _d.filter_string
+        if device != "unknown":
+            _d = dpctl.SyclDevice(device)
+            self.device = _d.filter_string
+        else:
+            self.device = "unknown"
+
+        self.queue = queue
 
         if name is None:
             type_name = "usm_ndarray"
diff --git a/numba_dpex/core/typing/typeof.py b/numba_dpex/core/typing/typeof.py
index 144acfa5de..3293db3553 100644
--- a/numba_dpex/core/typing/typeof.py
+++ b/numba_dpex/core/typing/typeof.py
@@ -62,5 +62,6 @@ def typeof_usm_ndarray(val, c):
         readonly=readonly,
         usm_type=usm_type,
         device=device,
+        queue=val.sycl_queue,
         addrspace=address_space.GLOBAL,
     )
diff --git a/numba_dpex/core/utils/suai_helper.py b/numba_dpex/core/utils/suai_helper.py
index 101f1f75c4..4dd40880d9 100644
--- a/numba_dpex/core/utils/suai_helper.py
+++ b/numba_dpex/core/utils/suai_helper.py
@@ -27,6 +27,7 @@ def __init__(
         dtype,
         usm_type,
         device,
+        queue,
     ):
         self._data = data
         self._data_writeable = writable
@@ -38,6 +39,7 @@ def __init__(
         self._dtype = dtype
         self._usm_type = usm_type
         self._device = device
+        self._queue = queue
 
     @property
     def data(self):
@@ -79,6 +81,10 @@ def usm_type(self):
     def device(self):
         return self._device
 
+    @property
+    def queue(self):
+        return self._queue
+
 
 def get_info_from_suai(obj):
     """
@@ -138,6 +144,7 @@ def get_info_from_suai(obj):
         size=total_size,
         usm_type=usm_type,
         device=device,
+        queue=syclobj,
         shape=shape,
         dimensions=ndim,
         itemsize=itemsize,
diff --git a/numba_dpex/examples/kernel/device_func.py b/numba_dpex/examples/kernel/device_func.py
index 2ebc57537a..507ca0377f 100644
--- a/numba_dpex/examples/kernel/device_func.py
+++ b/numba_dpex/examples/kernel/device_func.py
@@ -10,7 +10,8 @@
 N = 10
 
 
-# A device callable function that can be invoked from ``kernel`` and other device functions
+# A device callable function that can be invoked from ``kernel`` and other
+# device functions
 @ndpex.func
 def a_device_function(a):
     return a + 1
diff --git a/numba_dpex/tests/kernel_tests/test_barrier.py b/numba_dpex/tests/kernel_tests/test_barrier.py
index 651adb0e2e..3c65093273 100644
--- a/numba_dpex/tests/kernel_tests/test_barrier.py
+++ b/numba_dpex/tests/kernel_tests/test_barrier.py
@@ -11,7 +11,7 @@
 from numba_dpex import float32, usm_ndarray, void
 from numba_dpex.tests._helper import filter_strings
 
-f32arrty = usm_ndarray(float32, 1, "C", usm_type="device", device="0")
+f32arrty = usm_ndarray(float32, 1, "C")
 
 
 @pytest.mark.parametrize("filter_str", filter_strings)
@@ -25,7 +25,7 @@ def twice(A):
         A[i] = d * 2
 
     N = 256
-    arr = dpt.arange(N, dtype=dpt.float32, device="0")
+    arr = dpt.arange(N, dtype=dpt.float32)
     orig = dpt.asnumpy(arr)
     twice[N, N // 2](arr)
     after = dpt.asnumpy(arr)
@@ -44,7 +44,7 @@ def twice(A):
         A[i] = d * 2
 
     N = 256
-    arr = dpt.arange(N, dtype=dpt.float32, device="0")
+    arr = dpt.arange(N, dtype=dpt.float32)
     orig = dpt.asnumpy(arr)
     twice[N, dpex.DEFAULT_LOCAL_SIZE](arr)
     after = dpt.asnumpy(arr)
@@ -68,7 +68,7 @@ def reverse_array(A):
         # write
         A[i] += lm[blocksize - 1 - i]
 
-    arr = dpt.arange(blocksize, dtype=dpt.float32, device="0")
+    arr = dpt.arange(blocksize, dtype=dpt.float32)
     orig = dpt.asnumpy(arr)
     reverse_array[blocksize, blocksize](arr)
     after = dpt.asnumpy(arr)
diff --git a/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py b/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py
index 7451711eea..d417924513 100644
--- a/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py
+++ b/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py
@@ -9,7 +9,7 @@
 import numba_dpex as dpex
 from numba_dpex import int32, usm_ndarray
 
-i32arrty = usm_ndarray(int32, 1, "C", usm_type="device", device="0")
+i32arrty = usm_ndarray(int32, 1, "C")
 
 
 def f(a):
diff --git a/numba_dpex/tests/kernel_tests/test_kernel_specialization.py b/numba_dpex/tests/kernel_tests/test_kernel_specialization.py
index dbe31275af..e99d0eeabb 100644
--- a/numba_dpex/tests/kernel_tests/test_kernel_specialization.py
+++ b/numba_dpex/tests/kernel_tests/test_kernel_specialization.py
@@ -12,8 +12,8 @@
     MissingSpecializationError,
 )
 
-i64arrty = usm_ndarray(int64, 1, "C", usm_type="device", device="0")
-f32arrty = usm_ndarray(float32, 1, "C", usm_type="device", device="0")
+i64arrty = usm_ndarray(int64, 1, "C")
+f32arrty = usm_ndarray(float32, 1, "C")
 
 specialized_kernel1 = dpex.kernel((i64arrty, i64arrty, i64arrty))
 specialized_kernel2 = dpex.kernel(
@@ -66,9 +66,9 @@ def test_missing_specialization_error():
 
 def test_execution_of_specialized_kernel():
     """Test if the specialized kernel is correctly executed."""
-    a = dpt.ones(1024, dtype=dpt.int64, device="0")
-    b = dpt.ones(1024, dtype=dpt.int64, device="0")
-    c = dpt.zeros(1024, dtype=dpt.int64, device="0")
+    a = dpt.ones(1024, dtype=dpt.int64)
+    b = dpt.ones(1024, dtype=dpt.int64)
+    c = dpt.zeros(1024, dtype=dpt.int64)
 
     specialized_kernel1(data_parallel_sum)[
         1024,

From 42958f43c802c181f9a289808af42545459b2f34 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Thu, 12 Jan 2023 20:38:54 -0600
Subject: [PATCH 46/51] Rename AOT to eager compilation.

---
 ...ialization.py => kernel_specialization.py} | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)
 rename numba_dpex/examples/kernel/{aot_specialization.py => kernel_specialization.py} (81%)

diff --git a/numba_dpex/examples/kernel/aot_specialization.py b/numba_dpex/examples/kernel/kernel_specialization.py
similarity index 81%
rename from numba_dpex/examples/kernel/aot_specialization.py
rename to numba_dpex/examples/kernel/kernel_specialization.py
index ce0044c088..5798b80488 100644
--- a/numba_dpex/examples/kernel/aot_specialization.py
+++ b/numba_dpex/examples/kernel/kernel_specialization.py
@@ -14,11 +14,11 @@
     MissingSpecializationError,
 )
 
-# Similar to Numba, numba-dpex supports ahead-of-time (AOT) compilation of
-# functions. The following examples demonstrate the feature for
-# numba_dpex.kernel and presents usage scenarios and current limitations.
+# Similar to Numba, numba-dpex supports eager compilation of functions. The
+# following examples demonstrate the feature for numba_dpex.kernel and presents
+# usage scenarios and current limitations.
 
-# ------------                 AOT Example 1.                   ------------ #
+# ------------                 Example 1.                   ------------ #
 
 # Define type specializations using the numba_dpex usm_ndarray data type.
 i64arrty = usm_ndarray(int64, 1, "C", usm_type="device", device="0")
@@ -49,9 +49,9 @@ def data_parallel_sum(a, b, c):
 assert np.array_equal(npc, npc_expected)
 
 
-# ------------                 AOT Example 2.                   ------------ #
+# ------------                 Example 2.                   ------------ #
 
-# Multiple signatures can be specified as a list to AOT compile multiple
+# Multiple signatures can be specified as a list to eager compile multiple
 # versions of the kernel.
 
 # specialize a kernel for the i64arrty
@@ -91,9 +91,9 @@ def data_parallel_sum2(a, b, c):
 assert np.array_equal(npc, npc_expected)
 
 
-# ------------                 AOT Example 3.                   ------------ #
+# ------------                 Example 3.                   ------------ #
 
-# AOT specialized kernels cannot be jit compiled. Calling a specialized kernel
+# A specialized kernel cannot be jit compiled. Calling a specialized kernel
 # with arguments having type different from the specialization will result in
 # an MissingSpecializationError.
 
@@ -109,11 +109,11 @@ def data_parallel_sum2(a, b, c):
     print(mse)
 
 
-# ------------                 AOT Example 4.                   ------------ #
+# ------------                 Example 4.                   ------------ #
 
 # Numba_dpex does not support NumPy arrays as kernel arguments and all
 # array arguments should be inferable as a numba_dpex.types.usm_ndarray. Trying
-# to AOT with a NumPy array-based signature will lead to an
+# to eager compile with a NumPy array-based signature will lead to an
 # InvalidKernelSpecializationError
 
 try:

From 20a8f7da7818a531508ce09f639384965c151efa Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 17 Jan 2023 12:45:31 -0600
Subject: [PATCH 47/51] Formatting changes to error message.

---
 numba_dpex/core/exceptions.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py
index 3aefd4f489..558c81b01d 100644
--- a/numba_dpex/core/exceptions.py
+++ b/numba_dpex/core/exceptions.py
@@ -111,9 +111,10 @@ def __init__(self, kernel_name, ndims, max_work_item_dims) -> None:
 class UnmatchedNumberOfRangeDimsError(Exception):
     def __init__(self, kernel_name, global_ndims, local_ndims) -> None:
         self.message = (
-            f"Specified global_range for kernel {kernel_name} has {global_ndims} dimensions, "
-            f"while specified local_range with dimenstions of {local_ndims} doesn't match "
-            "with global_range."
+            f"Specified global_range for kernel {kernel_name} has "
+            f"{global_ndims} dimensions, "
+            f"while specified local_range with dimenstions of {local_ndims} "
+            "doesn't match with global_range."
         )
         super().__init__(self.message)
 
@@ -147,9 +148,9 @@ class UnsupportedGroupWorkItemSizeError(Exception):
     def __init__(self, kernel_name, dim, work_groups, work_items) -> None:
         self.message = (
             f"Attempting to launch kernel {kernel_name} with "
-            f"{work_groups} global work groups and {work_items} local work items "
-            f"in dimension {dim} is not supported. The global work groups must be "
-            f"able to divide local work items evenly."
+            f"{work_groups} global work groups and {work_items} local work "
+            f"items in dimension {dim} is not supported. The global work "
+            "groups must be evenly divisibly by the local work items."
         )
         super().__init__(self.message)
 

From e6924fc81e427b3641f9c324c0a81b243ca100c5 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 17 Jan 2023 15:05:58 -0600
Subject: [PATCH 48/51] Deprecate DEFAULT_LOCAL_SIZE as kernel launch arg.

   - The DEFAULT_LOCAL_SIZE is deprecated and users warned to
     provided a valid local range for nd_range kernels.
   - Removed the global_range and local_range kw args from
     JitKernel.__call__().
   - Undeprecate the JitKernel.__getitem__ call.
   - Fix and improve how arguments to JitKernel.__call__() are
     parsed to extract the global_range and local_range.
---
 .../core/kernel_interface/dispatcher.py       | 171 ++++++------------
 1 file changed, 58 insertions(+), 113 deletions(-)

diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index 8fc00e65c3..2ef1b83f49 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -82,7 +82,6 @@ def __init__(
         self.compile_flags = compile_flags
         self.kernel_name = pyfunc.__name__
 
-        # TODO: To be removed once the__getitem__ is removed
         self._global_range = None
         self._local_range = None
 
@@ -461,132 +460,80 @@ def __getitem__(self, args):
             KernelLauncher: A clone of the KernelLauncher object, but with the
             global_range and local_range attributes initialized.
 
-        .. deprecated:: 0.19
         """
 
-        warn(
-            "The [] (__getitem__) method to set global and local ranges for "
-            + "launching a kernel is deprecated. "
-            + 'Set the "global_range" and the "local_range" keyword '
-            + "arguments when calling the kernel instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
+        if isinstance(args, int):
+            self._global_range = [args]
+            self._local_range = None
+        elif (isinstance(args, tuple) or isinstance(args, list)) and all(
+            isinstance(v, int) for v in args
+        ):
+            self._global_range = list(args)
+            self._local_range = None
+        elif isinstance(args, tuple) and len(args) == 2:
+
+            gr = args[0]
+            lr = args[1]
+            if isinstance(gr, int):
+                self._global_range = [gr]
+            elif all(isinstance(v, int) for v in gr) and len(gr) != 0:
+                self._global_range = list(gr)
+            else:
+                raise IllegalRangeValueError(kernel_name=self.kernel_name)
 
-        nargs = len(args)
-        # Check if the kernel launch arguments are sane.
-        if nargs < 1:
-            raise UnknownGlobalRangeError(kernel_name=self.kernel_name)
-        elif nargs > 2:
-            raise InvalidKernelLaunchArgsError(
-                kernel_name=self.kernel_name, args=args
-            )
-        self._global_range = args[0]
-        if nargs == 2 and args[1] != []:
-            self._local_range = args[1]
+            if isinstance(lr, int):
+                self._local_range = [lr]
+            elif isinstance(lr, list) and len(lr) == 0:
+                # deprecation warning
+                warn(
+                    "Specifying the local range as an empty list "
+                    "(DEFAULT_LOCAL_SIZE) is deprecated. The kernel will be "
+                    "executed as a basic data-parallel kernel over the global "
+                    "range. Specify a valid local range to execute the kernel "
+                    "as an ND-range kernel.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+                self._local_range = None
+            elif all(isinstance(v, int) for v in lr) and len(lr) != 0:
+                self._local_range = list(lr)
+            else:
+                raise IllegalRangeValueError(kernel_name=self.kernel_name)
         else:
-            self._local_range = None
+            raise IllegalRangeValueError(kernel_name=self.kernel_name)
 
-        return copy.copy(self)
+        # FIXME:[::-1] is done as OpenCL and SYCl have different orders when
+        # it comes to specifying dimensions.
+        self._global_range = list(self._global_range)[::-1]
+        if self._local_range:
+            self._local_range = list(self._local_range)[::-1]
 
-    def _get_ranges(self, global_range, local_range, device):
-        """Helper to get the global and local range values needed to launch a
-        kernel.
+        return copy.copy(self)
 
-        The global and local range arguments can either be provided using the
-        __getitem__ method or as keyword arguments to the __call__ method.
-        The function verifies that the range values are specified using at least
-        one of the method.
+    def _check_ranges(self, device):
+        """Helper to get the validate the global and local range values prior
+        to launching a kernel.
 
         Args:
-            global_range (list or tuple): The global range to be used for kernel
-            launch.
-            local_range (list or tuple): The local range to be used for kernel
-            launch.
             device (dpctl.SyclDevice): The device on which to launch the kernel.
-
-        Raises:
-            UnknownGlobalRangeError: When no global range was specified for
-            kernel launch.
         """
-        if global_range:
-            if self._global_range:
-                warn(
-                    "Ignoring the previously set value of global_range and "
-                    + "using the value specified at the kernel call site."
-                )
-        else:
-            if self._global_range:
-                warn(
-                    "Use of __getitem__ to set the global_range attribute is "
-                    + 'deprecated. Use the keyword argument "global_range" '
-                    + "when calling the kernel to specify the global range."
-                )
-                global_range = self._global_range
-            else:
-                raise UnknownGlobalRangeError(self.kernel_name)
-
-        if local_range:
-            if self._local_range:
-                warn(
-                    "Ignoring the previously set value of local_range and "
-                    + "using the value specified at the kernel call site.."
-                )
-        else:
-            if self._local_range:
-                warn(
-                    "Use of __getitem__ to set the local_range attribute is "
-                    + 'deprecated. Use the keyword argument "local_range" '
-                    + "when calling the kernel to specify the local range."
-                )
-                local_range = self._local_range
-            else:
-                local_range = None
-                warn(
-                    "Kernel to be submitted without a local range letting "
-                    + "the SYCL runtime select a local range. The behavior "
-                    + "can lead to suboptimal performance in certain cases. "
-                    + "Consider setting the local range value for the kernel "
-                    + "execution.\n"
-                    + "The local_range keyword may be made a required argument "
-                    + "in the future when calling a kernel."
-                )
-
-        if isinstance(global_range, int):
-            global_range = [global_range]
-
         # If only global range value is provided, then the kernel is invoked
         # over an N-dimensional index space defined by a SYCL range<N>, where
         # N is one, two or three.
         # If both local and global range values are specified the kernel is
-        # invoked using a SYCL nd_range
+        # invoked as a SYCL nd_range kernel.
 
-        if global_range and not local_range:
-            self._check_range(global_range, device)
-            # FIXME:[::-1] is done as OpenCL and SYCl have different orders when
-            # it comes to specifying dimensions.
-            global_range = list(global_range)[::-1]
+        if self._global_range and not self._local_range:
+            self._check_range(self._global_range, device)
         else:
-            if isinstance(local_range, int):
-                local_range = [local_range]
             self._check_ndrange(
-                global_range=global_range,
-                local_range=local_range,
+                global_range=self._global_range,
+                local_range=self._local_range,
                 device=device,
             )
-            global_range = list(global_range)[::-1]
-            local_range = list(local_range)[::-1]
-
-        return (global_range, local_range)
 
-    def __call__(self, *args, global_range=None, local_range=None):
-        """Functor to launch a kernel.
-
-        Args:
-            global_range (list or tuple): optional global range for kernel
-            launch.
-            local_range (list or tuple): optional local range for kernel launch.
-        """
+    def __call__(self, *args):
+        """Functor to launch a kernel."""
         argtypes = [self.typingctx.resolve_argument_type(arg) for arg in args]
         # FIXME: For specialized and ahead of time compiled and cached kernels,
         # the CFD check was already done statically. The run-time check is
@@ -602,11 +549,6 @@ def __call__(self, *args, global_range=None, local_range=None):
                 self.kernel_name, backend, JitKernel._supported_backends
             )
 
-        # TODO: Refactor after __getitem__ is removed
-        global_range, local_range = self._get_ranges(
-            global_range, local_range, exec_queue.sycl_device
-        )
-
         # load the kernel from cache
         key = build_key(
             tuple(argtypes),
@@ -654,11 +596,14 @@ def __call__(self, *args, global_range=None, local_range=None):
             access_specifiers_list=self.array_access_specifiers,
         )
 
+        # Make sure the kernel lauch range/nd_range are sane
+        self._check_ranges(exec_queue.sycl_device)
+
         exec_queue.submit(
             sycl_kernel,
             packer.unpacked_args,
-            global_range,
-            local_range,
+            self._global_range,
+            self._local_range,
         )
 
         exec_queue.wait()

From 75f269b95f2d9c31eb58047f077553d14d9fd210 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 17 Jan 2023 19:07:14 -0600
Subject: [PATCH 49/51] Update tests after changes to kernel lauch params.

---
 numba_dpex/tests/kernel_tests/test_atomic_op.py          | 4 +++-
 numba_dpex/tests/kernel_tests/test_barrier.py            | 8 +++++---
 numba_dpex/tests/kernel_tests/test_caching.py            | 6 ++++--
 numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py | 6 +++---
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/numba_dpex/tests/kernel_tests/test_atomic_op.py b/numba_dpex/tests/kernel_tests/test_atomic_op.py
index 95e25d08f9..404365cc71 100644
--- a/numba_dpex/tests/kernel_tests/test_atomic_op.py
+++ b/numba_dpex/tests/kernel_tests/test_atomic_op.py
@@ -120,7 +120,9 @@ def test_kernel_atomic_local(filter_str, input_arrays, return_list_of_op):
     kernel = dpex.kernel(f)
     device = dpctl.SyclDevice(filter_str)
     with dpctl.device_context(device):
-        kernel[global_size, global_size](a)
+        gs = (N,)
+        ls = (N,)
+        kernel[gs, ls](a)
     assert a[0] == expected
 
 
diff --git a/numba_dpex/tests/kernel_tests/test_barrier.py b/numba_dpex/tests/kernel_tests/test_barrier.py
index 3c65093273..9223caa641 100644
--- a/numba_dpex/tests/kernel_tests/test_barrier.py
+++ b/numba_dpex/tests/kernel_tests/test_barrier.py
@@ -27,7 +27,9 @@ def twice(A):
     N = 256
     arr = dpt.arange(N, dtype=dpt.float32)
     orig = dpt.asnumpy(arr)
-    twice[N, N // 2](arr)
+    global_size = (N,)
+    local_size = (N // 2,)
+    twice[global_size, local_size](arr)
     after = dpt.asnumpy(arr)
     # The computation is correct?
     np.testing.assert_allclose(orig * 2, after)
@@ -46,7 +48,7 @@ def twice(A):
     N = 256
     arr = dpt.arange(N, dtype=dpt.float32)
     orig = dpt.asnumpy(arr)
-    twice[N, dpex.DEFAULT_LOCAL_SIZE](arr)
+    twice[N](arr)
     after = dpt.asnumpy(arr)
     # The computation is correct?
     np.testing.assert_allclose(orig * 2, after)
@@ -70,7 +72,7 @@ def reverse_array(A):
 
     arr = dpt.arange(blocksize, dtype=dpt.float32)
     orig = dpt.asnumpy(arr)
-    reverse_array[blocksize, blocksize](arr)
+    reverse_array[(blocksize,), (blocksize,)](arr)
     after = dpt.asnumpy(arr)
     expected = orig[::-1] + orig
     np.testing.assert_allclose(expected, after)
diff --git a/numba_dpex/tests/kernel_tests/test_caching.py b/numba_dpex/tests/kernel_tests/test_caching.py
index 58c7fa1ed4..abeb77723d 100644
--- a/numba_dpex/tests/kernel_tests/test_caching.py
+++ b/numba_dpex/tests/kernel_tests/test_caching.py
@@ -48,9 +48,11 @@ def data_parallel_sum(x, y, z):
         ),
     )
 
+    d_launcher = d[100]
+
     N = 10
     for i in range(N):
-        d(a, b, c, global_range=[100])
+        d_launcher(a, b, c)
     actual = dpt.asnumpy(c)
 
-    assert np.array_equal(expected, actual) and (d.cache_hits == N - 1)
+    assert np.array_equal(expected, actual) and (d_launcher.cache_hits == N - 1)
diff --git a/numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py b/numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py
index e92fdbea28..9211a5366b 100644
--- a/numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py
+++ b/numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py
@@ -30,9 +30,9 @@ def test_ndrange_config_error(error, ndrange):
     ndrange kernel with unspported arguments.
     """
 
-    a = dpt.ones(1024, dtype=dpt.int32, device="0")
-    b = dpt.ones(1024, dtype=dpt.int32, device="0")
-    c = dpt.zeros(1024, dtype=dpt.int64, device="0")
+    a = dpt.ones(1024, dtype=dpt.int32)
+    b = dpt.ones(1024, dtype=dpt.int32)
+    c = dpt.zeros(1024, dtype=dpt.int64)
 
     with pytest.raises(error):
         kernel_vector_sum[ndrange](a, b, c)

From 664d57e25098e33445cd947cffcdc4ad3cb49394 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Tue, 17 Jan 2023 21:57:53 -0600
Subject: [PATCH 50/51] Update kernel examples based on latest changes.

---
 numba_dpex/examples/kernel/atomic_op.py       |  2 +-
 numba_dpex/examples/kernel/black_scholes.py   |  4 +-
 numba_dpex/examples/kernel/device_func.py     |  2 +-
 numba_dpex/examples/kernel/interpolation.py   |  7 ++-
 .../{ => kernel}/kernel_private_memory.py     | 18 ++++---
 .../examples/kernel/kernel_specialization.py  | 53 +++++++++---------
 numba_dpex/examples/{ => kernel}/matmul.py    | 17 +++---
 .../{ => kernel}/pairwise_distance.py         |  4 +-
 numba_dpex/examples/kernel/scan.py            |  2 +-
 .../{ => kernel}/select_device_for_kernel.py  |  4 +-
 .../{ => kernel}/sum_reduction_ocl.py         | 18 +++----
 .../sum_reduction_recursive_ocl.py            | 46 +++++-----------
 numba_dpex/examples/kernel/vector_sum.py      |  2 +-
 .../{sum2D.py => kernel/vector_sum2D.py}      | 27 ++++++----
 numba_dpex/examples/sum.py                    | 54 -------------------
 numba_dpex/examples/sum_ndarray.py            | 53 ------------------
 numba_dpex/examples/sum_reduction.py          |  4 +-
 numba_dpex/examples/usm_ndarray.py            | 46 ----------------
 18 files changed, 93 insertions(+), 270 deletions(-)
 rename numba_dpex/examples/{ => kernel}/kernel_private_memory.py (75%)
 rename numba_dpex/examples/{ => kernel}/matmul.py (73%)
 rename numba_dpex/examples/{ => kernel}/pairwise_distance.py (98%)
 rename numba_dpex/examples/{ => kernel}/select_device_for_kernel.py (97%)
 rename numba_dpex/examples/{ => kernel}/sum_reduction_ocl.py (78%)
 rename numba_dpex/examples/{ => kernel}/sum_reduction_recursive_ocl.py (64%)
 rename numba_dpex/examples/{sum2D.py => kernel/vector_sum2D.py} (56%)
 delete mode 100644 numba_dpex/examples/sum.py
 delete mode 100644 numba_dpex/examples/sum_ndarray.py
 delete mode 100644 numba_dpex/examples/usm_ndarray.py

diff --git a/numba_dpex/examples/kernel/atomic_op.py b/numba_dpex/examples/kernel/atomic_op.py
index 4e11546958..2e10f7cc18 100644
--- a/numba_dpex/examples/kernel/atomic_op.py
+++ b/numba_dpex/examples/kernel/atomic_op.py
@@ -20,7 +20,7 @@ def main():
     print("Using device ...")
     print(a.device)
 
-    atomic_reduction[N, ndpex.DEFAULT_LOCAL_SIZE](a)
+    atomic_reduction[N](a)
     print("Reduction sum =", a[0])
 
     print("Done...")
diff --git a/numba_dpex/examples/kernel/black_scholes.py b/numba_dpex/examples/kernel/black_scholes.py
index fa61cab3cd..3f6e9c5bd6 100644
--- a/numba_dpex/examples/kernel/black_scholes.py
+++ b/numba_dpex/examples/kernel/black_scholes.py
@@ -94,9 +94,7 @@ def main():
     print("Using device ...")
     print(price.device)
 
-    kernel_black_scholes[NOPT, ndpx.DEFAULT_LOCAL_SIZE](
-        price, strike, t, rate, volatility, call, put
-    )
+    kernel_black_scholes[NOPT](price, strike, t, rate, volatility, call, put)
 
     print("Call:", call)
     print("Put:", put)
diff --git a/numba_dpex/examples/kernel/device_func.py b/numba_dpex/examples/kernel/device_func.py
index 507ca0377f..939a79336b 100644
--- a/numba_dpex/examples/kernel/device_func.py
+++ b/numba_dpex/examples/kernel/device_func.py
@@ -33,7 +33,7 @@ def a_kernel_function(a, b):
 # Utility function for printing
 def driver(a, b, N):
     print("A=", a)
-    a_kernel_function[N, ndpex.DEFAULT_LOCAL_SIZE](a, b)
+    a_kernel_function[N](a, b)
     print("B=", b)
 
 
diff --git a/numba_dpex/examples/kernel/interpolation.py b/numba_dpex/examples/kernel/interpolation.py
index 06632d98c0..7568ad60e7 100644
--- a/numba_dpex/examples/kernel/interpolation.py
+++ b/numba_dpex/examples/kernel/interpolation.py
@@ -114,10 +114,9 @@ def main():
 
     print("Using device ...")
     print(xp.device)
-
-    kernel_polynomial[N_POINTS // N_POINTS_PER_WORK_ITEM, LOCAL_SIZE](
-        xp, yp, COEFFICIENTS
-    )
+    global_range = (N_POINTS // N_POINTS_PER_WORK_ITEM,)
+    local_range = (LOCAL_SIZE,)
+    kernel_polynomial[global_range, local_range](xp, yp, COEFFICIENTS)
 
     # Copy results back to the host
     nyp = np.asnumpy(yp)
diff --git a/numba_dpex/examples/kernel_private_memory.py b/numba_dpex/examples/kernel/kernel_private_memory.py
similarity index 75%
rename from numba_dpex/examples/kernel_private_memory.py
rename to numba_dpex/examples/kernel/kernel_private_memory.py
index 537b02c9b2..089f8b41d4 100644
--- a/numba_dpex/examples/kernel_private_memory.py
+++ b/numba_dpex/examples/kernel/kernel_private_memory.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import dpctl
+import dpctl.tensor as dpt
 import numpy as np
 from numba import float32
 
@@ -30,21 +31,22 @@ def private_memory_kernel(A):
         A[i] = memory[0] * 2
 
     N = 4
-    arr = np.zeros(N).astype(np.float32)
+    device = dpctl.select_default_device()
+
+    arr = dpt.zeros(N, dtype=dpt.float32, device=device)
     orig = np.arange(N).astype(np.float32)
 
-    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
-    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
-    device = dpctl.select_default_device()
     print("Using device ...")
     device.print_device_info()
 
-    with numba_dpex.offload_to_sycl_device(device):
-        private_memory_kernel[N, N](arr)
+    global_range = (N,)
+    local_range = (N,)
+    private_memory_kernel[global_range, local_range](arr)
 
-    np.testing.assert_allclose(orig * 2, arr)
+    arr_out = dpt.asnumpy(arr)
+    np.testing.assert_allclose(orig * 2, arr_out)
     # the output should be `orig[i] * 2, i.e. [0, 2, 4, ..]``
-    print(arr)
+    print(arr_out)
 
 
 def main():
diff --git a/numba_dpex/examples/kernel/kernel_specialization.py b/numba_dpex/examples/kernel/kernel_specialization.py
index 5798b80488..a3cd7fa759 100644
--- a/numba_dpex/examples/kernel/kernel_specialization.py
+++ b/numba_dpex/examples/kernel/kernel_specialization.py
@@ -2,8 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import logging
-
 import dpctl.tensor as dpt
 import numpy as np
 
@@ -21,8 +19,8 @@
 # ------------                 Example 1.                   ------------ #
 
 # Define type specializations using the numba_dpex usm_ndarray data type.
-i64arrty = usm_ndarray(int64, 1, "C", usm_type="device", device="0")
-f32arrty = usm_ndarray(float32, 1, "C", usm_type="device", device="0")
+i64arrty = usm_ndarray(int64, 1, "C")
+f32arrty = usm_ndarray(float32, 1, "C")
 
 
 # specialize a kernel for the i64arrty
@@ -36,13 +34,11 @@ def data_parallel_sum(a, b, c):
 
 
 # run the specialized kernel
-a = dpt.ones(1024, dtype=dpt.int64, device="0")
-b = dpt.ones(1024, dtype=dpt.int64, device="0")
-c = dpt.zeros(1024, dtype=dpt.int64, device="0")
+a = dpt.ones(1024, dtype=dpt.int64)
+b = dpt.ones(1024, dtype=dpt.int64)
+c = dpt.zeros(1024, dtype=dpt.int64)
 
-data_parallel_sum[
-    1024,
-](a, b, c)
+data_parallel_sum[1024](a, b, c)
 
 npc = dpt.asnumpy(c)
 npc_expected = np.full(1024, 2, dtype=np.int64)
@@ -65,26 +61,22 @@ def data_parallel_sum2(a, b, c):
 
 
 # run the i64 specialized kernel
-a = dpt.ones(1024, dtype=dpt.int64, device="0")
-b = dpt.ones(1024, dtype=dpt.int64, device="0")
-c = dpt.zeros(1024, dtype=dpt.int64, device="0")
+a = dpt.ones(1024, dtype=dpt.int64)
+b = dpt.ones(1024, dtype=dpt.int64)
+c = dpt.zeros(1024, dtype=dpt.int64)
 
-data_parallel_sum2[
-    1024,
-](a, b, c)
+data_parallel_sum2[1024](a, b, c)
 
 npc = dpt.asnumpy(c)
 npc_expected = np.full(1024, 2, dtype=np.int64)
 assert np.array_equal(npc, npc_expected)
 
 # run the f32 specialized kernel
-a = dpt.ones(1024, dtype=dpt.float32, device="0")
-b = dpt.ones(1024, dtype=dpt.float32, device="0")
-c = dpt.zeros(1024, dtype=dpt.float32, device="0")
+a = dpt.ones(1024, dtype=dpt.float32)
+b = dpt.ones(1024, dtype=dpt.float32)
+c = dpt.zeros(1024, dtype=dpt.float32)
 
-data_parallel_sum2[
-    1024,
-](a, b, c)
+data_parallel_sum2[1024](a, b, c)
 
 npc = dpt.asnumpy(c)
 npc_expected = np.full(1024, 2, dtype=np.float32)
@@ -102,9 +94,7 @@ def data_parallel_sum2(a, b, c):
 c = dpt.zeros(1024, dtype=dpt.int32)
 
 try:
-    data_parallel_sum[
-        1024,
-    ](a, b, c)
+    data_parallel_sum[1024](a, b, c)
 except MissingSpecializationError as mse:
     print(mse)
 
@@ -118,8 +108,9 @@ def data_parallel_sum2(a, b, c):
 
 try:
     dpex.kernel((int64[::1], int64[::1], int64[::1]))
-except InvalidKernelSpecializationError:
-    logging.exception()
+except InvalidKernelSpecializationError as e:
+    print("Dpex kernels cannot be specialized using NumPy arrays.")
+    print(e)
 
 
 # ------------                 Limitations                       ------------ #
@@ -131,5 +122,9 @@ def data_parallel_sum2(a, b, c):
 
 try:
     dpex.kernel("(i64arrty, i64arrty, i64arrty)")
-except NotImplementedError:
-    logging.exception()
+except NotImplementedError as e:
+    print(
+        "Dpex kernels cannot be specialized using signatures specified as "
+        "strings."
+    )
+    print(e)
diff --git a/numba_dpex/examples/matmul.py b/numba_dpex/examples/kernel/matmul.py
similarity index 73%
rename from numba_dpex/examples/matmul.py
rename to numba_dpex/examples/kernel/matmul.py
index 17851e97d3..a40ccc207b 100644
--- a/numba_dpex/examples/matmul.py
+++ b/numba_dpex/examples/kernel/matmul.py
@@ -5,6 +5,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import dpctl
+import dpctl.tensor as dpt
 import numpy as np
 
 import numba_dpex as dpex
@@ -41,16 +42,20 @@ def driver(a, b, c):
 def main():
     a = np.arange(X * X, dtype=np.float32).reshape(X, X)
     b = np.array(np.random.random(X * X), dtype=np.float32).reshape(X, X)
-    c = np.ones_like(a).reshape(X, X)
 
-    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
-    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
     device = dpctl.select_default_device()
+    a_dpt = dpt.arange(X * X, dtype=dpt.float32, device=device)
+    a_dpt = dpt.reshape(a_dpt, (X, X))
+    b_dpt = dpt.asarray(b, dtype=dpt.float32, device=device)
+    b_dpt = dpt.reshape(b_dpt, (X, X))
+    c_dpt = dpt.ones_like(a_dpt)
+    c_dpt = dpt.reshape(c_dpt, (X, X))
+
     print("Using device ...")
     device.print_device_info()
 
-    with dpctl.device_context(device):
-        driver(a, b, c)
+    driver(a_dpt, b_dpt, c_dpt)
+    c_out = dpt.asnumpy(c_dpt)
 
     # Host compute using standard NumPy
     Amat = np.matrix(a)
@@ -58,7 +63,7 @@ def main():
     Cans = Amat * Bmat
 
     # Check result
-    assert np.allclose(c, Cans)
+    assert np.allclose(c_out, Cans)
 
     print("Done...")
 
diff --git a/numba_dpex/examples/pairwise_distance.py b/numba_dpex/examples/kernel/pairwise_distance.py
similarity index 98%
rename from numba_dpex/examples/pairwise_distance.py
rename to numba_dpex/examples/kernel/pairwise_distance.py
index 377a2b30d1..30d940a871 100644
--- a/numba_dpex/examples/pairwise_distance.py
+++ b/numba_dpex/examples/kernel/pairwise_distance.py
@@ -25,9 +25,9 @@
 args = parser.parse_args()
 
 # Global work size is equal to the number of points
-global_size = args.n
+global_size = (args.n,)
 # Local Work size is optional
-local_size = args.l
+local_size = (args.l,)
 
 X = np.random.random((args.n, args.d)).astype(np.single)
 D = np.empty((args.n, args.n), dtype=np.single)
diff --git a/numba_dpex/examples/kernel/scan.py b/numba_dpex/examples/kernel/scan.py
index 85ecdf86a5..6ee4056fbb 100644
--- a/numba_dpex/examples/kernel/scan.py
+++ b/numba_dpex/examples/kernel/scan.py
@@ -56,7 +56,7 @@ def main():
 
     print("Using device ...")
     print(arr.device)
-    kernel_hillis_steele_scan[N, ndpx.DEFAULT_LOCAL_SIZE](arr)
+    kernel_hillis_steele_scan[N](arr)
 
     # the output should be [0, 1, 3, 6, ...]
     arr_np = np.asnumpy(arr)
diff --git a/numba_dpex/examples/select_device_for_kernel.py b/numba_dpex/examples/kernel/select_device_for_kernel.py
similarity index 97%
rename from numba_dpex/examples/select_device_for_kernel.py
rename to numba_dpex/examples/kernel/select_device_for_kernel.py
index 3107efea49..7c08d7e9eb 100644
--- a/numba_dpex/examples/select_device_for_kernel.py
+++ b/numba_dpex/examples/kernel/select_device_for_kernel.py
@@ -86,7 +86,7 @@ def select_device_ndarray(N):
     default_device = dpctl.select_default_device()
 
     with numba_dpex.offload_to_sycl_device(default_device.filter_string):
-        sum_kernel[N, 1](a, b, got)
+        sum_kernel[(N,), (1,)](a, b, got)
 
     expected = a + b
 
@@ -110,7 +110,7 @@ def select_device_SUAI(N):
 
     # Users don't need to specify where the computation will
     # take place. It will be inferred from data.
-    sum_kernel[N, 1](da, db, dc)
+    sum_kernel[(N,), (1,)](da, db, dc)
 
     dc.usm_data.copy_to_host(got.reshape((-1)).view("|u1"))
 
diff --git a/numba_dpex/examples/sum_reduction_ocl.py b/numba_dpex/examples/kernel/sum_reduction_ocl.py
similarity index 78%
rename from numba_dpex/examples/sum_reduction_ocl.py
rename to numba_dpex/examples/kernel/sum_reduction_ocl.py
index 949890d708..b6bf8e61ab 100644
--- a/numba_dpex/examples/sum_reduction_ocl.py
+++ b/numba_dpex/examples/kernel/sum_reduction_ocl.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import dpctl
-import numpy as np
+import dpctl.tensor as dpt
 from numba import int32
 
 import numba_dpex as dpex
@@ -47,16 +47,11 @@ def sum_reduce(A):
     # nb_work_groups have to be even for this implementation
     nb_work_groups = global_size // work_group_size
 
-    partial_sums = np.zeros(nb_work_groups).astype(A.dtype)
+    partial_sums = dpt.zeros(nb_work_groups, dtype=A.dtype, device=A.device)
 
-    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
-    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
-    device = dpctl.select_default_device()
-    print("Using device ...")
-    device.print_device_info()
-
-    with dpctl.device_context(device):
-        sum_reduction_kernel[global_size, work_group_size](A, partial_sums)
+    gs = (global_size,)
+    ls = (work_group_size,)
+    sum_reduction_kernel[gs, ls](A, partial_sums)
 
     final_sum = 0
     # calculate the final sum in HOST
@@ -68,7 +63,8 @@ def sum_reduce(A):
 
 def test_sum_reduce():
     N = 1024
-    A = np.ones(N).astype(np.int32)
+    device = dpctl.select_default_device()
+    A = dpt.ones(N, dtype=dpt.int32, device=device)
 
     print("Running Device + Host reduction")
 
diff --git a/numba_dpex/examples/sum_reduction_recursive_ocl.py b/numba_dpex/examples/kernel/sum_reduction_recursive_ocl.py
similarity index 64%
rename from numba_dpex/examples/sum_reduction_recursive_ocl.py
rename to numba_dpex/examples/kernel/sum_reduction_recursive_ocl.py
index bdaebc55df..7bfd401f02 100644
--- a/numba_dpex/examples/sum_reduction_recursive_ocl.py
+++ b/numba_dpex/examples/kernel/sum_reduction_recursive_ocl.py
@@ -9,8 +9,7 @@
 """
 
 import dpctl
-import dpctl.memory as dpctl_mem
-import numpy as np
+import dpctl.tensor as dpt
 from numba import int32
 
 import numba_dpex as dpex
@@ -59,12 +58,13 @@ def sum_recursive_reduction(size, group_size, Dinp, Dpartial_sums):
             nb_work_groups += 1
             passed_size = nb_work_groups * group_size
 
-    sum_reduction_kernel[passed_size, group_size](Dinp, size, Dpartial_sums)
+    gr = (passed_size,)
+    lr = (group_size,)
+
+    sum_reduction_kernel[gr, lr](Dinp, size, Dpartial_sums)
 
     if nb_work_groups <= group_size:
-        sum_reduction_kernel[group_size, group_size](
-            Dpartial_sums, nb_work_groups, Dinp
-        )
+        sum_reduction_kernel[lr, lr](Dpartial_sums, nb_work_groups, Dinp)
         result = Dinp[0]
     else:
         result = sum_recursive_reduction(
@@ -81,40 +81,18 @@ def sum_reduce(A):
     if (global_size % work_group_size) != 0:
         nb_work_groups += 1
 
-    partial_sums = np.zeros(nb_work_groups).astype(A.dtype)
-
-    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
-    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
-    device = dpctl.select_default_device()
-    print("Using device ...")
-    device.print_device_info()
-
-    with dpctl.device_context(device) as q:
-        inp_buf = dpctl_mem.MemoryUSMShared(A.size * A.dtype.itemsize, queue=q)
-        inp_ndarray = np.ndarray(A.shape, buffer=inp_buf, dtype=A.dtype)
-        np.copyto(inp_ndarray, A)
-
-        partial_sums_buf = dpctl_mem.MemoryUSMShared(
-            partial_sums.size * partial_sums.dtype.itemsize, queue=q
-        )
-        partial_sums_ndarray = np.ndarray(
-            partial_sums.shape,
-            buffer=partial_sums_buf,
-            dtype=partial_sums.dtype,
-        )
-        np.copyto(partial_sums_ndarray, partial_sums)
-
-        result = sum_recursive_reduction(
-            global_size, work_group_size, inp_ndarray, partial_sums_ndarray
-        )
+    partial_sums = dpt.zeros(nb_work_groups, dtype=A.dtype, device=A.device)
+    result = sum_recursive_reduction(
+        global_size, work_group_size, A, partial_sums
+    )
 
     return result
 
 
 def test_sum_reduce():
     N = 20000
-
-    A = np.ones(N).astype(np.int32)
+    device = dpctl.select_default_device()
+    A = dpt.ones(N, dtype=dpt.int32, device=device)
 
     print("Running recursive reduction")
 
diff --git a/numba_dpex/examples/kernel/vector_sum.py b/numba_dpex/examples/kernel/vector_sum.py
index 4ebda8f38a..cb1b9fa2bb 100644
--- a/numba_dpex/examples/kernel/vector_sum.py
+++ b/numba_dpex/examples/kernel/vector_sum.py
@@ -18,7 +18,7 @@ def kernel_vector_sum(a, b, c):
 # Utility function for printing and testing
 def driver(a, b, c, global_size):
 
-    kernel_vector_sum[global_size, ndpx.DEFAULT_LOCAL_SIZE](a, b, c)
+    kernel_vector_sum[global_size](a, b, c)
 
     a_np = dpnp.asnumpy(a)  # Copy dpnp array a to NumPy array a_np
     b_np = dpnp.asnumpy(b)  # Copy dpnp array b to NumPy array b_np
diff --git a/numba_dpex/examples/sum2D.py b/numba_dpex/examples/kernel/vector_sum2D.py
similarity index 56%
rename from numba_dpex/examples/sum2D.py
rename to numba_dpex/examples/kernel/vector_sum2D.py
index 58ff015862..089721b7c1 100644
--- a/numba_dpex/examples/sum2D.py
+++ b/numba_dpex/examples/kernel/vector_sum2D.py
@@ -5,6 +5,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import dpctl
+import dpctl.tensor as dpt
 import numpy as np
 
 import numba_dpex as dpex
@@ -21,10 +22,7 @@ def data_parallel_sum(a, b, c):
 
 
 def driver(a, b, c, global_size):
-    print("before A: ", a)
-    print("before B: ", b)
-    data_parallel_sum[global_size, dpex.DEFAULT_LOCAL_SIZE](a, b, c)
-    print("after  C : ", c)
+    data_parallel_sum[global_size](a, b, c)
 
 
 def main():
@@ -34,19 +32,26 @@ def main():
     global_size = X, Y
 
     a = np.arange(X * Y, dtype=np.float32).reshape(X, Y)
-    b = np.array(np.random.random(X * Y), dtype=np.float32).reshape(X, Y)
-    c = np.ones_like(a).reshape(X, Y)
+    b = np.arange(X * Y, dtype=np.float32).reshape(X, Y)
+    c = np.empty_like(a).reshape(X, Y)
+
+    c = a + b
 
-    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
-    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
     device = dpctl.select_default_device()
+    a_dpt = dpt.arange(X * Y, dtype=dpt.float32, device=device)
+    a_dpt = dpt.reshape(a_dpt, (X, Y))
+    b_dpt = dpt.arange(X * Y, dtype=dpt.float32, device=device)
+    b_dpt = dpt.reshape(b_dpt, (X, Y))
+    c_dpt = dpt.empty_like(a_dpt)
+    c_dpt = dpt.reshape(c_dpt, (X, Y))
+
     print("Using device ...")
     device.print_device_info()
 
-    with dpctl.device_context(device):
-        driver(a, b, c, global_size)
+    driver(a_dpt, b_dpt, c_dpt, global_size)
 
-    print(c)
+    c_out = dpt.asnumpy(c_dpt)
+    assert np.allclose(c, c_out)
 
     print("Done...")
 
diff --git a/numba_dpex/examples/sum.py b/numba_dpex/examples/sum.py
deleted file mode 100644
index 063ab250ee..0000000000
--- a/numba_dpex/examples/sum.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#! /usr/bin/env python
-
-# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import dpctl
-import numpy as np
-import numpy.testing as testing
-
-import numba_dpex as dpex
-
-
-@dpex.kernel
-def data_parallel_sum(a, b, c):
-    """
-    Vector addition using the ``kernel`` decorator.
-    """
-    i = dpex.get_global_id(0)
-    c[i] = a[i] + b[i]
-
-
-def driver(a, b, c, global_size):
-    print("A : ", a)
-    print("B : ", b)
-    data_parallel_sum[global_size, dpex.DEFAULT_LOCAL_SIZE](a, b, c)
-    print("A + B = ")
-    print("C ", c)
-    testing.assert_equal(c, a + b)
-
-
-def main():
-    global_size = 10
-    N = global_size
-    print("N", N)
-
-    a = np.array(np.random.random(N), dtype=np.float32)
-    b = np.array(np.random.random(N), dtype=np.float32)
-    c = np.ones_like(a)
-
-    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
-    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
-    device = dpctl.select_default_device()
-    print("Using device ...")
-    device.print_device_info()
-
-    with dpctl.device_context(device):
-        driver(a, b, c, global_size)
-
-    print("Done...")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/numba_dpex/examples/sum_ndarray.py b/numba_dpex/examples/sum_ndarray.py
deleted file mode 100644
index 1545cf6b53..0000000000
--- a/numba_dpex/examples/sum_ndarray.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#! /usr/bin/env python
-
-# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import dpctl
-import numpy as np
-from _helper import has_cpu, has_gpu
-
-import numba_dpex as dpex
-
-
-@dpex.kernel(
-    access_types={
-        "read_only": ["a", "b"],
-        "write_only": ["c"],
-        "read_write": [],
-    }
-)
-def data_parallel_sum(a, b, c):
-    i = dpex.get_global_id(0)
-    c[i] = a[i] + b[i]
-
-
-global_size = 64
-local_size = 32
-N = global_size * local_size
-
-a = np.arange(N, dtype=np.float32)
-b = np.arange(N, dtype=np.float32)
-c = np.empty_like(a)
-
-
-def main():
-
-    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
-    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
-    device = dpctl.select_default_device()
-    print("Using device ...")
-    device.print_device_info()
-
-    with dpctl.device_context(device):
-        print("before A: ", a)
-        print("before B: ", b)
-        data_parallel_sum[global_size, local_size](a, b, c)
-        print("after  C: ", c)
-
-    print("Done...")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/numba_dpex/examples/sum_reduction.py b/numba_dpex/examples/sum_reduction.py
index f3e3a2518b..cecafa5603 100644
--- a/numba_dpex/examples/sum_reduction.py
+++ b/numba_dpex/examples/sum_reduction.py
@@ -34,9 +34,7 @@ def sum_reduce(A):
     with dpctl.device_context(device):
         while total > 1:
             global_size = total // 2
-            sum_reduction_kernel[global_size, dpex.DEFAULT_LOCAL_SIZE](
-                A, R, global_size
-            )
+            sum_reduction_kernel[global_size](A, R, global_size)
             total = total // 2
 
     return R[0]
diff --git a/numba_dpex/examples/usm_ndarray.py b/numba_dpex/examples/usm_ndarray.py
deleted file mode 100644
index 219b01d42d..0000000000
--- a/numba_dpex/examples/usm_ndarray.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import dpctl
-import dpctl.tensor as dpt
-import numpy.testing as testing
-
-import numba_dpex as dpex
-
-
-@dpex.kernel
-def vector_add(a, b, c):
-    """
-    Vector addition using the ``kernel`` decorator.
-    """
-    i = dpex.get_global_id(0)
-    c[i] = a[i] + b[i]
-
-
-def driver(a, b, c, global_size):
-    vector_add[global_size, dpex.DEFAULT_LOCAL_SIZE](a, b, c)
-    npa = dpt.asnumpy(a)
-    npb = dpt.asnumpy(b)
-    npc = dpt.asnumpy(c)
-    testing.assert_equal(npc, npa + npb)
-
-
-def main():
-    N = 1024
-    print("N", N)
-
-    a = dpt.arange(N)
-    b = dpt.arange(N)
-    c = dpt.zeros(N)
-
-    print("Using device ...")
-    a.sycl_device.print_device_info()
-
-    driver(a, b, c, N)
-
-    print("Done...")
-
-
-if __name__ == "__main__":
-    main()

From c74ea24628904640bdc645d6e3553d9f0e52d675 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Wed, 18 Jan 2023 11:42:28 -0600
Subject: [PATCH 51/51] Improve dispatcher checks for laych args and add unit
 test.

---
 numba_dpex/core/exceptions.py                 |  13 +-
 .../core/kernel_interface/dispatcher.py       |  67 +++++------
 .../kernel_tests/test_kernel_launch_params.py | 113 ++++++++++++++++++
 3 files changed, 148 insertions(+), 45 deletions(-)
 create mode 100644 numba_dpex/tests/kernel_tests/test_kernel_launch_params.py

diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py
index 558c81b01d..9a4a7563c6 100644
--- a/numba_dpex/core/exceptions.py
+++ b/numba_dpex/core/exceptions.py
@@ -55,18 +55,9 @@ class InvalidKernelLaunchArgsError(Exception):
     """
 
     def __init__(self, kernel_name):
-        warn(
-            "The InvalidKernelLaunchArgsError class is deprecated, and will "
-            + "be removed once kernel launching using __getitem__ for the "
-            + "KernelLauncher class is removed.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
         self.message = (
-            "Invalid launch arguments specified for launching the Kernel "
-            f'"{kernel_name}". Launch arguments can only be a tuple '
-            "specifying the global range or a tuple of tuples specifying "
-            "global and local ranges."
+            "Invalid global and local range arguments specified for launching "
+            f' the Kernel "{kernel_name}". Refer documentation for details.'
         )
         super().__init__(self.message)
 
diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py
index 2ef1b83f49..ed602199ca 100644
--- a/numba_dpex/core/kernel_interface/dispatcher.py
+++ b/numba_dpex/core/kernel_interface/dispatcher.py
@@ -461,46 +461,45 @@ def __getitem__(self, args):
             global_range and local_range attributes initialized.
 
         """
-
         if isinstance(args, int):
             self._global_range = [args]
             self._local_range = None
-        elif (isinstance(args, tuple) or isinstance(args, list)) and all(
-            isinstance(v, int) for v in args
-        ):
-            self._global_range = list(args)
-            self._local_range = None
-        elif isinstance(args, tuple) and len(args) == 2:
-
-            gr = args[0]
-            lr = args[1]
-            if isinstance(gr, int):
-                self._global_range = [gr]
-            elif all(isinstance(v, int) for v in gr) and len(gr) != 0:
-                self._global_range = list(gr)
-            else:
-                raise IllegalRangeValueError(kernel_name=self.kernel_name)
-
-            if isinstance(lr, int):
-                self._local_range = [lr]
-            elif isinstance(lr, list) and len(lr) == 0:
-                # deprecation warning
-                warn(
-                    "Specifying the local range as an empty list "
-                    "(DEFAULT_LOCAL_SIZE) is deprecated. The kernel will be "
-                    "executed as a basic data-parallel kernel over the global "
-                    "range. Specify a valid local range to execute the kernel "
-                    "as an ND-range kernel.",
-                    DeprecationWarning,
-                    stacklevel=2,
-                )
+        elif isinstance(args, tuple) or isinstance(args, list):
+            if len(args) == 1 and all(isinstance(v, int) for v in args):
+                self._global_range = list(args)
                 self._local_range = None
-            elif all(isinstance(v, int) for v in lr) and len(lr) != 0:
-                self._local_range = list(lr)
+            elif len(args) == 2:
+                gr = args[0]
+                lr = args[1]
+                if isinstance(gr, int):
+                    self._global_range = [gr]
+                elif len(gr) != 0 and all(isinstance(v, int) for v in gr):
+                    self._global_range = list(gr)
+                else:
+                    raise IllegalRangeValueError(kernel_name=self.kernel_name)
+
+                if isinstance(lr, int):
+                    self._local_range = [lr]
+                elif isinstance(lr, list) and len(lr) == 0:
+                    # deprecation warning
+                    warn(
+                        "Specifying the local range as an empty list "
+                        "(DEFAULT_LOCAL_SIZE) is deprecated. The kernel will "
+                        "be executed as a basic data-parallel kernel over the "
+                        "global range. Specify a valid local range to execute "
+                        "the kernel as an ND-range kernel.",
+                        DeprecationWarning,
+                        stacklevel=2,
+                    )
+                    self._local_range = None
+                elif len(lr) != 0 and all(isinstance(v, int) for v in lr):
+                    self._local_range = list(lr)
+                else:
+                    raise IllegalRangeValueError(kernel_name=self.kernel_name)
             else:
-                raise IllegalRangeValueError(kernel_name=self.kernel_name)
+                raise InvalidKernelLaunchArgsError(kernel_name=self.kernel_name)
         else:
-            raise IllegalRangeValueError(kernel_name=self.kernel_name)
+            raise InvalidKernelLaunchArgsError(kernel_name=self.kernel_name)
 
         # FIXME:[::-1] is done as OpenCL and SYCl have different orders when
         # it comes to specifying dimensions.
diff --git a/numba_dpex/tests/kernel_tests/test_kernel_launch_params.py b/numba_dpex/tests/kernel_tests/test_kernel_launch_params.py
new file mode 100644
index 0000000000..fa7658623d
--- /dev/null
+++ b/numba_dpex/tests/kernel_tests/test_kernel_launch_params.py
@@ -0,0 +1,113 @@
+# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+import numba_dpex as dpex
+from numba_dpex.core.exceptions import (
+    IllegalRangeValueError,
+    InvalidKernelLaunchArgsError,
+)
+
+
+@dpex.kernel
+def vecadd(a, b, c):
+    i = dpex.get_global_id(0)
+    a[i] = b[i] + c[i]
+
+
+def test_1D_global_range_as_int():
+    k = vecadd[10]
+    assert k._global_range == [10]
+    assert k._local_range is None
+
+
+def test_1D_global_range_as_one_tuple():
+    k = vecadd[
+        10,
+    ]
+    assert k._global_range == [10]
+    assert k._local_range is None
+
+
+def test_1D_global_range_as_list():
+    k = vecadd[[10]]
+    assert k._global_range == [10]
+    assert k._local_range is None
+
+
+def test_1D_global_range_and_1D_local_range():
+    k = vecadd[10, 10]
+    assert k._global_range == [10]
+    assert k._local_range == [10]
+
+
+def test_1D_global_range_and_1D_local_range2():
+    k = vecadd[[10, 10]]
+    assert k._global_range == [10]
+    assert k._local_range == [10]
+
+
+def test_1D_global_range_and_1D_local_range3():
+    k = vecadd[(10,), (10,)]
+    assert k._global_range == [10]
+    assert k._local_range == [10]
+
+
+def test_2D_global_range_and_2D_local_range():
+    k = vecadd[(10, 10), (10, 10)]
+    assert k._global_range == [10, 10]
+    assert k._local_range == [10, 10]
+
+
+def test_2D_global_range_and_2D_local_range2():
+    k = vecadd[[10, 10], (10, 10)]
+    assert k._global_range == [10, 10]
+    assert k._local_range == [10, 10]
+
+
+def test_2D_global_range_and_2D_local_range3():
+    k = vecadd[(10, 10), [10, 10]]
+    assert k._global_range == [10, 10]
+    assert k._local_range == [10, 10]
+
+
+def test_2D_global_range_and_2D_local_range4():
+    k = vecadd[[10, 10], [10, 10]]
+    assert k._global_range == [10, 10]
+    assert k._local_range == [10, 10]
+
+
+def test_deprecation_warning_for_empty_local_range():
+    with pytest.deprecated_call():
+        k = vecadd[[10, 10], []]
+    assert k._global_range == [10, 10]
+    assert k._local_range is None
+
+
+def test_deprecation_warning_for_empty_local_range2():
+    with pytest.deprecated_call():
+        k = vecadd[10, []]
+    assert k._global_range == [10]
+    assert k._local_range is None
+
+
+def test_illegal_kernel_launch_arg():
+    with pytest.raises(InvalidKernelLaunchArgsError):
+        vecadd[10, 10, []]
+
+
+def test_illegal_range_error():
+    with pytest.raises(IllegalRangeValueError):
+        vecadd[[], []]
+
+
+def test_illegal_range_error2():
+    with pytest.raises(IllegalRangeValueError):
+        vecadd[[], 10]
+
+
+def test_illegal_range_error3():
+    with pytest.raises(IllegalRangeValueError):
+        vecadd[(), 10]