Skip to content

Commit

Permalink
Refactored kernel dispatch API.
Browse files Browse the repository at this point in the history
  - The concept of a kernel was decoupled from the notion of
    dispatching a kernel. The present implementation in
    compiler.py intermixes the two, which makes it hard to
    separate compute-follows-data based kernel launch from
    the legacy `dpctl.device_context` based behavior.
  - Deprecates support for numpy arrays as kernel args.
  - Deprecates support for the square bracket notation using
    `__getitem__` to provide global and local ranges for a
    kernel launch.
  - Changes the behavior of specializing a kernel using only
    a signature. The new way to specialize will require a
    device type and a backend.
  - Improvements to exception messages using custom exceptions.
  - The new API is now inside `numba_dpex.core.kernel_interface`.
  • Loading branch information
Diptorup Deb committed Oct 18, 2022
1 parent 231c285 commit 736bff3
Show file tree
Hide file tree
Showing 4 changed files with 577 additions and 0 deletions.
6 changes: 6 additions & 0 deletions numba_dpex/core/kernel_interface/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0

"""Defines the Kernel classes that abstract a SYCL device kernel.
"""
369 changes: 369 additions & 0 deletions numba_dpex/core/kernel_interface/dispatcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,369 @@
# SPDX-FileCopyrightText: 2022 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0

import copy
from warnings import warn

import dpctl

from numba_dpex.core.descriptor import dpex_target
from numba_dpex.core.exceptions import (
ComputeFollowsDataInferenceError,
ExecutionQueueInferenceError,
InvalidKernelLaunchArgsError,
UnknownGlobalRangeError,
UnsupportedBackendError,
)
from numba_dpex.core.types import Array as ArrayType
from numba_dpex.dpctl_iface import USMNdArrayType


class KernelLauncher(object):
    """Creates a Kernel object from a @kernel decorated function and enqueues
    the Kernel object on a specified device.

    A launcher stores the Python function together with its debug and
    compile flags, and an optional launch configuration (``global_range``
    and ``local_range``). Calling the launcher infers the SYCL queue to use
    from the kernel arguments and validates the queue's backend. Kernel
    compilation and submission are still marked as TODO in ``__call__``.
    """

    def __init__(
        self,
        pyfunc,
        debug_flags=None,
        compile_flags=None,
        array_access_specifiers=None,
    ):
        """Store the kernel function and its compilation settings.

        Args:
            pyfunc: The Python function that was decorated as a kernel. Its
                ``__name__`` is used as the kernel name in error messages.
            debug_flags: Debug settings stored as-is on the launcher.
            compile_flags: Compilation settings stored as-is on the launcher.
            array_access_specifiers: Deprecated. Access specifiers for NumPy
                ndarray arguments. A ``DeprecationWarning`` is issued when a
                truthy value is provided.
        """
        self.typingctx = dpex_target.typing_context
        self.pyfunc = pyfunc
        self.debug_flags = debug_flags
        self.compile_flags = compile_flags
        self.kernel_name = pyfunc.__name__

        # The launch configuration is optional until ``__getitem__`` or the
        # keyword arguments of ``__call__`` provide it. Initialising the
        # attributes here lets ``__call__`` read them even when
        # ``__getitem__`` was never used.
        self.global_range = None
        self.local_range = None

        if array_access_specifiers:
            warn(
                "Access specifiers apply only to NumPy ndarrays. "
                + "Support for NumPy ndarray objects as kernel arguments "
                + "and access specifiers flags is deprecated. "
                + "Use dpctl.tensor.usm_ndarray based arrays instead.",
                DeprecationWarning,
                stacklevel=2,
            )
        self.array_access_specifiers = array_access_specifiers

    # def _ensure_valid_work_item_grid(self, val, sycl_queue):

    #     if not isinstance(val, (tuple, list, int)):
    #         error_message = (
    #             "Cannot create work item dimension from provided argument"
    #         )
    #         raise ValueError(error_message)

    #     if isinstance(val, int):
    #         val = [val]

    #     # TODO: we need some way to check the max dimensions
    #     """
    #     if len(val) > device_env.get_max_work_item_dims():
    #         error_message = ("Unsupported number of work item dimensions ")
    #         raise ValueError(error_message)
    #     """

    #     return list(
    #         val[::-1]
    #     )  # reversing due to sycl and opencl interop kernel range mismatch semantic

    def _determine_compute_follows_data_queue(self, usm_array_list):
        """Determine the execution queue for the list of usm array args using
        compute follows data rules.

        Uses ``dpctl.utils.get_execution_queue()`` to check if the list of
        queues belonging to the usm_ndarrays are equivalent. If the queues are
        equivalent, then returns the queue. If the queues are not equivalent
        then returns None.

        Args:
            usm_array_list : A list of usm_ndarray objects

        Returns:
            The common queue used to allocate the arrays. If no such queue
            exists, then returns None.
        """
        queues = [usm_array.sycl_queue for usm_array in usm_array_list]
        return dpctl.utils.get_execution_queue(queues)

    def _determine_kernel_launch_queue(self, args):
        """Determines the queue where the kernel is to be launched.

        The execution queue is derived using the following algorithm. In
        future, support for ``numpy.ndarray`` and ``dpctl.device_context`` is
        to be removed and queue derivation will follow the Python Array API's
        "compute follows data" logic.

        * NumPy ndarray and usm_ndarray arguments are mixed:
          raise ComputeFollowsDataInferenceError.
        * All array arguments are usm_ndarray based: any active
          ``dpctl.device_context`` is ignored (with a warning) and the queue
          is inferred using compute follows data rules. If no common queue
          exists, raise ComputeFollowsDataInferenceError.
        * All array arguments are NumPy ndarrays, or there are no array
          arguments: if the kernel was invoked from within a
          ``dpctl.device_context`` issue deprecation warnings and return
          ``dpctl.get_current_queue()``; otherwise raise
          ExecutionQueueInferenceError.

        Args:
            args : A sequence of the arguments passed to the kernel.

        Returns:
            The queue on which the kernel is to be launched.

        Raises:
            ComputeFollowsDataInferenceError: If the queue could not be
                inferred using compute follows data rules.
            ExecutionQueueInferenceError: If the queue could not be inferred
                using the dpctl queue manager.
        """
        # Resolve every argument type once and classify from the result.
        argtypes = [self.typingctx.resolve_argument_type(arg) for arg in args]
        usmarray_argnums = [
            i for i, ty in enumerate(argtypes) if isinstance(ty, USMNdArrayType)
        ]
        # Guard against USMNdArrayType being derived from ArrayType, in which
        # case usm arrays would otherwise also be counted as NumPy arrays and
        # every usm-only launch would be reported as "mixed".
        # NOTE(review): the class hierarchy is not visible here - confirm.
        array_argnums = [
            i
            for i, ty in enumerate(argtypes)
            if isinstance(ty, ArrayType) and not isinstance(ty, USMNdArrayType)
        ]

        # If usm and non-usm array arguments are getting mixed, then the
        # execution queue cannot be inferred using compute follows data rules.
        if array_argnums and usmarray_argnums:
            # NOTE(review): this call passes the ndarray argnums as the first
            # positional argument whereas the call further below passes the
            # kernel name first. Confirm against the exception's signature.
            raise ComputeFollowsDataInferenceError(
                array_argnums, usmarray_argnums
            )

        if usmarray_argnums:
            if dpctl.is_in_device_context():
                # This helper is invoked from ``__call__``; stacklevel=3
                # attributes the warning to the code that called the kernel.
                warn(
                    "dpctl.device_context ignored as the kernel arguments "
                    + "are dpctl.tensor.usm_ndarray based array containers.",
                    stacklevel=3,
                )
            usm_array_args = [args[i] for i in usmarray_argnums]
            queue = self._determine_compute_follows_data_queue(usm_array_args)
            if not queue:
                raise ComputeFollowsDataInferenceError(
                    self.kernel_name, usmarray_argnums
                )
            return queue

        # Legacy path: only NumPy ndarray arguments, or no array arguments at
        # all. The queue can only come from an enclosing device_context.
        if not dpctl.is_in_device_context():
            raise ExecutionQueueInferenceError(self.kernel_name)

        warn(
            "Support for dpctl.device_context to specify the "
            + "execution queue is deprecated. "
            + "Use dpctl.tensor.usm_ndarray based array "
            + "containers instead. ",
            DeprecationWarning,
            stacklevel=3,
        )
        if array_argnums:
            warn(
                "Support for NumPy ndarray objects as kernel arguments is "
                + "deprecated. Use dpctl.tensor.usm_ndarray based array "
                + "containers instead. ",
                DeprecationWarning,
                stacklevel=3,
            )
        return dpctl.get_current_queue()

    def __getitem__(self, args):
        """Mimics ``numba.cuda`` square-bracket notation for configuring the
        global_range and local_range settings when launching a kernel on a
        SYCL queue.

        Calling ``__getitem__`` clones the KernelLauncher object and sets the
        ``global_range`` and optionally the ``local_range`` attributes of the
        clone with the arguments passed to ``__getitem__``. The object on
        which ``__getitem__`` is called is left unmodified.

        Args:
            args (tuple): A tuple of tuples that specify the global and
                optionally the local range for the kernel execution. If the
                argument has two entries, the first entry is considered to be
                the global range and the second the local range. If only a
                single entry is provided, then the kernel is launched with
                only a global range and the local range configuration is
                decided by the SYCL runtime. A value that is neither a tuple
                nor a list (e.g. ``kernel[16]``) is treated as a lone global
                range.

        Returns:
            KernelLauncher: A clone of the KernelLauncher object, but with the
            global_range and local_range attributes initialized.

        Raises:
            UnknownGlobalRangeError: If no range was provided.
            InvalidKernelLaunchArgsError: If more than two entries were
                provided.

        .. deprecated:: 0.19
            Use :func:`KernelLauncher.execute` instead.
        """
        warn(
            "The [] (__getitem__) method to set global and local ranges for "
            + "launching a kernel is deprecated. "
            + "Use the execute function instead.",
            DeprecationWarning,
            stacklevel=2,
        )

        # ``launcher[16]`` hands over the bare value instead of a tuple.
        if not isinstance(args, (tuple, list)):
            args = (args,)

        nargs = len(args)

        # Check if the kernel launch arguments are sane.
        if nargs < 1:
            raise UnknownGlobalRangeError(kernel_name=self.kernel_name)
        if nargs > 2:
            raise InvalidKernelLaunchArgsError(
                kernel_name=self.kernel_name, args=args
            )

        # Configure the clone rather than ``self`` so that the original
        # launcher does not silently retain the launch configuration.
        launcher = copy.copy(self)
        launcher.global_range = args[0]
        if nargs == 2 and args[1] != []:
            launcher.local_range = args[1]
        else:
            launcher.local_range = None

        return launcher

    def _resolve_global_range(self, global_range):
        """Pick the global range from the call site or from ``__getitem__``.

        Called by ``__call__``; warnings use stacklevel=3 so that they are
        attributed to the code that called the kernel.
        """
        if global_range:
            if self.global_range:
                warn(
                    "Ignoring the previously set value of global_range and "
                    + "using the value specified at the kernel call site.",
                    stacklevel=3,
                )
            return global_range

        if not self.global_range:
            raise UnknownGlobalRangeError(self.kernel_name)

        warn(
            "Use of __getitem__ to set the global_range attribute is "
            + 'deprecated. Use the keyword argument "global_range" of '
            + "__call__ method to set the attribute.",
            stacklevel=3,
        )
        return self.global_range

    def _resolve_local_range(self, local_range):
        """Pick the local range from the call site or from ``__getitem__``.

        Returns None, after a warning, when neither source provides a value.
        Called by ``__call__``; warnings use stacklevel=3 so that they are
        attributed to the code that called the kernel.
        """
        if local_range:
            if self.local_range:
                warn(
                    "Ignoring the previously set value of local_range and "
                    + "using the value specified at the kernel call site..",
                    stacklevel=3,
                )
            return local_range

        if self.local_range:
            warn(
                "Use of __getitem__ to set the local_range attribute is "
                + 'deprecated. Use the keyword argument "local_range" of '
                + "__call__ method to set the attribute.",
                stacklevel=3,
            )
            return self.local_range

        warn(
            "Kernel to be submitted without a local range letting "
            + "the SYCL runtime select a local range. The behavior "
            + "can lead to suboptimal performance in certain cases. "
            + "Consider setting the local range value for the kernel "
            + "execution.\n"
            + "The local_range keyword may be made a required argument "
            + "in the future.",
            stacklevel=3,
        )
        return None

    def __call__(self, *args, global_range=None, local_range=None):
        """Validate the launch configuration and the execution queue for a
        kernel call.

        Kernel compilation and submission are not implemented yet (see the
        TODO notes at the end of the method); currently the method only
        resolves the ranges, infers the execution queue and checks that the
        queue's backend is supported.

        Args:
            *args: The arguments to be passed to the kernel.
            global_range (optional): The global range for the launch. Takes
                precedence over a value set using ``__getitem__``.
            local_range (optional): The local range for the launch. Takes
                precedence over a value set using ``__getitem__``. Defaults
                to None, in which case a warning is issued.

        Raises:
            UnknownGlobalRangeError: If no global range was provided either
                at the call site or using ``__getitem__``.
            ComputeFollowsDataInferenceError: If the execution queue could
                not be inferred using compute follows data rules.
            ExecutionQueueInferenceError: If the execution queue could not be
                inferred using the dpctl queue manager.
            UnsupportedBackendError: If the backend of the execution queue is
                neither opencl nor level_zero.
        """
        global_range = self._resolve_global_range(global_range)
        local_range = self._resolve_local_range(local_range)

        # TODO: Move out to separate function
        # ``args`` is handed over as a single sequence; unpacking it would
        # break for any kernel that does not take exactly one argument.
        exec_queue = self._determine_kernel_launch_queue(args)
        backend = exec_queue.backend
        supported_backends = ["opencl", "level_zero"]
        if backend not in (
            dpctl.backend_type.opencl,
            dpctl.backend_type.level_zero,
        ):
            raise UnsupportedBackendError(
                self.kernel_name, backend, supported_backends
            )
        # TODO: Test global and local ranges to be valid for the device
        # TODO: Create a kernel (or check if we already have a cached version)
        # TODO: kernel.compile()

        # TODO: submit the kernel
        # kernelargs = []
        # internal_device_arrs = []
        # for ty, val, access_type in zip(
        #     self.argument_types, args, self.ordered_arg_access_types
        # ):
        #     self._unpack_argument(
        #         ty,
        #         val,
        #         self.sycl_queue,
        #         kernelargs,
        #         internal_device_arrs,
        #         access_type,
        #     )

        # self.sycl_queue.submit(
        #     self.kernel, kernelargs, self.global_size, self.local_size
        # )
        # self.sycl_queue.wait()

        # for ty, val, i_dev_arr, access_type in zip(
        #     self.argument_types,
        #     args,
        #     internal_device_arrs,
        #     self.ordered_arg_access_types,
        # ):
        #     self._pack_argument(
        #         ty, val, self.sycl_queue, i_dev_arr, access_type
        #     )
Loading

0 comments on commit 736bff3

Please sign in to comment.