From 38c2181f15d304011882ad9b87c2fe12fce35526 Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Sat, 13 Jan 2024 12:21:41 -0600 Subject: [PATCH 01/12] Make the get_flattened_member_count public. --- numba_dpex/core/datamodel/models.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/numba_dpex/core/datamodel/models.py b/numba_dpex/core/datamodel/models.py index 2085be7eb8..7cf8a4eaa1 100644 --- a/numba_dpex/core/datamodel/models.py +++ b/numba_dpex/core/datamodel/models.py @@ -22,8 +22,9 @@ ) -def _get_flattened_member_count(ty): - """Return the number of fields in an instance of a given StructModel.""" +def get_flattened_member_count(ty): + """Returns the number of fields in an instance of a given StructModel.""" + flattened_member_count = 0 members = ty._members for member in members: @@ -109,7 +110,7 @@ def flattened_field_count(self): """ Return the number of fields in an instance of a USMArrayDeviceModel. """ - return _get_flattened_member_count(self) + return get_flattened_member_count(self) class USMArrayHostModel(StructModel): @@ -143,7 +144,7 @@ def __init__(self, dmm, fe_type): @property def flattened_field_count(self): """Return the number of fields in an instance of a USMArrayHostModel.""" - return _get_flattened_member_count(self) + return get_flattened_member_count(self) class SyclQueueModel(StructModel): @@ -223,7 +224,7 @@ def __init__(self, dmm, fe_type): @property def flattened_field_count(self): """Return the number of fields in an instance of a RangeModel.""" - return _get_flattened_member_count(self) + return get_flattened_member_count(self) class NdRangeModel(StructModel): @@ -246,7 +247,7 @@ def __init__(self, dmm, fe_type): @property def flattened_field_count(self): """Return the number of fields in an instance of a NdRangeModel.""" - return _get_flattened_member_count(self) + return get_flattened_member_count(self) def _init_data_model_manager() -> datamodel.DataModelManager: From a787b4133f396df07301e26b8e5103a64b8b46a8 Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Tue, 13 Feb 2024 13:55:56 -0600 Subject: [PATCH 02/12] Adds a mock class for sycl::local_accessor --- numba_dpex/kernel_api/__init__.py | 10 +- numba_dpex/kernel_api/launcher.py | 43 ++++-- numba_dpex/kernel_api/local_accessor.py | 135 ++++++++++++++++++ .../tests/kernel_api/test_local_accessor.py | 47 ++++++ 4 files changed, 217 insertions(+), 18 deletions(-) create mode 100644 numba_dpex/kernel_api/local_accessor.py create mode 100644 numba_dpex/tests/kernel_api/test_local_accessor.py diff --git a/numba_dpex/kernel_api/__init__.py b/numba_dpex/kernel_api/__init__.py index 4ff9ec742a..a6da7b009c 100644 --- a/numba_dpex/kernel_api/__init__.py +++ b/numba_dpex/kernel_api/__init__.py @@ -14,21 +14,25 @@ from .barrier import group_barrier from .index_space_ids import Group, Item, NdItem from .launcher import call_kernel +from .local_accessor import LocalAccessor from .memory_enums import AddressSpace, MemoryOrder, MemoryScope from .private_array import PrivateArray from .ranges import NdRange, Range __all__ = [ + "call_kernel", + "group_barrier", "AddressSpace", "atomic_fence", "AtomicRef", + "Group", + "Item", + "LocalAccessor", "MemoryOrder", "MemoryScope", + "NdItem", "NdRange", "Range", - "Group", - "NdItem", - "Item", "PrivateArray", "group_barrier", "call_kernel", diff --git a/numba_dpex/kernel_api/launcher.py b/numba_dpex/kernel_api/launcher.py index 98a293e52b..6c746a46bb 100644 --- a/numba_dpex/kernel_api/launcher.py +++ b/numba_dpex/kernel_api/launcher.py @@ -9,6 +9,7 @@ from itertools import product from .index_space_ids import Group, Item, NdItem +from .local_accessor import LocalAccessor, _LocalAccessorMock from .ranges import NdRange, Range @@ -33,6 +34,12 @@ def _range_kernel_launcher(kernel_fn, index_range, *kernel_args): range_sets = [range(ir) for ir in index_range] index_tuples = list(product(*range_sets)) + for karg in kernel_args: + if isinstance(karg, LocalAccessor): + raise TypeError( + "LocalAccessor arguments are only supported for NdRange kernels" + ) + for idx in index_tuples: it = Item(extent=index_range, index=idx) @@ -66,6 +73,12 @@ def _ndrange_kernel_launcher(kernel_fn, index_range, *kernel_args): local_index_tuples = list(product(*local_range_sets)) group_index_tuples = list(product(*group_range_sets)) + modified_kernel_args = [] + for karg in kernel_args: + if isinstance(karg, LocalAccessor): + karg = _LocalAccessorMock(karg) + modified_kernel_args.append(karg) + # Loop over the groups (parallel loop) for gidx in group_index_tuples: # loop over work items in the group (parallel loop) @@ -76,27 +89,27 @@ def _ndrange_kernel_launcher(kernel_fn, index_range, *kernel_args): global_id.append( gidx_val * index_range.local_range[dim] + lidx[dim] ) - # Every NdItem has its own global Item, local Item and Group - nditem = NdItem( - global_item=Item( - extent=index_range.global_range, index=global_id - ), - local_item=Item(extent=index_range.local_range, index=lidx), - group=Group( - index_range.global_range, - index_range.local_range, - group_range, - gidx, - ), - ) - if len(signature(kernel_fn).parameters) - len(kernel_args) != 1: raise ValueError( "Required number of kernel function arguments do not " "match provided number of kernel args" ) - kernel_fn(nditem, *kernel_args) + kernel_fn( + NdItem( + global_item=Item( + extent=index_range.global_range, index=global_id + ), + local_item=Item(extent=index_range.local_range, index=lidx), + group=Group( + index_range.global_range, + index_range.local_range, + group_range, + gidx, + ), + ), + *modified_kernel_args + ) def call_kernel(kernel_fn, index_range, *kernel_args): diff --git a/numba_dpex/kernel_api/local_accessor.py b/numba_dpex/kernel_api/local_accessor.py new file mode 100644 index 0000000000..7bd3f7fb64 --- /dev/null +++ b/numba_dpex/kernel_api/local_accessor.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +"""Implements a Python analogue to SYCL's local_accessor class. The class is +intended to be used in pure Python code when prototyping a kernel function +and to be passed to an actual kernel function for local memory allocation. +""" +import numpy + + +class LocalAccessor: + """ + The ``LocalAccessor`` class is analogous to SYCL's ``local_accessor`` + class. The class acts a s proxy to allocating device local memory and + accessing that memory from within a :func:`numba_dpex.kernel` decorated + function. + """ + + def _verify_positive_integral_list(self, ls): + """Checks if all members of a list are positive integers.""" + + ret = False + try: + ret = all(int(val) > 0 for val in ls) + except ValueError: + pass + + return ret + + def __init__(self, shape, dtype) -> None: + """Creates a new LocalAccessor instance of the given shape and dtype.""" + + if not isinstance(shape, (list, tuple)): + if hasattr(shape, "tolist"): + fn = getattr(shape, "tolist") + if callable(fn): + self._shape = shape.tolist() + else: + try: + self._shape = [ + shape, + ] + except Exception as e: + raise TypeError( + "Argument shape must a non-negative integer, " + "or a list/tuple of such integers." + ) from e + else: + self._shape = list(shape) + + # Make sure shape is made up a supported types + if not self._verify_positive_integral_list(self._shape): + raise TypeError( + "Argument shape must a non-negative integer, " + "or a list/tuple of such integers." + ) + + # Make sure shape has a rank between (1..3) + if len(self._shape) < 1 or len(self._shape) > 3: + raise TypeError("LocalAccessor can only have up to 3 dimensions.") + + self._dtype = dtype + + if self._dtype not in [ + numpy.float32, + numpy.float64, + numpy.int32, + numpy.int64, + numpy.int16, + numpy.int8, + numpy.uint32, + numpy.uint64, + numpy.uint16, + numpy.uint8, + ]: + raise TypeError( + f"Argument dtype {dtype} is not supported. numpy.float32, " + "numpy.float64, numpy.[u]int8, numpy.[u]int16, numpy.[u]int32, " + "numpy.[u]int64 are the currently supported dtypes." + ) + + self._data = numpy.empty(self._shape, dtype=self._dtype) + + def __getitem__(self, idx_obj): + """Returns the value stored at the position represented by idx_obj in + the self._data ndarray. + """ + + raise NotImplementedError( + "The data of a LocalAccessor object can only be accessed " + "inside a kernel." + ) + + def __setitem__(self, idx_obj, val): + """Assigns a new value to the position represented by idx_obj in + the self._data ndarray. + """ + + raise NotImplementedError( + "The data of a LocalAccessor object can only be accessed " + "inside a kernel." + ) + + +class _LocalAccessorMock: + """Mock class that is used to represent a local accessor inside a "kernel". + + A LocalAccessor represents a device-only memory allocation and the + class is designed in a way to not have any data container backing up the + actual memory storage. Instead, the _LocalAccessorMock class is used to + represent a local_accessor that has an actual numpy ndarray backing it up. + Whenever, a LocalAccessor object is passed to `func`:kernel_api.call_kernel` + it is converted to a _LocalAccessor internally. That way the data and + access function on the data only works inside a kernel to simulate + device-only memory allocation and outside the kernel the data for a + LocalAccessor is not accessible. + """ + + def __init__(self, local_accessor: LocalAccessor): + self._data = local_accessor._data + + def __getitem__(self, idx_obj): + """Returns the value stored at the position represented by idx_obj in + the self._data ndarray. + """ + + return self._data[idx_obj] + + def __setitem__(self, idx_obj, val): + """Assigns a new value to the position represented by idx_obj in + the self._data ndarray. + """ + + self._data[idx_obj] = val diff --git a/numba_dpex/tests/kernel_api/test_local_accessor.py b/numba_dpex/tests/kernel_api/test_local_accessor.py new file mode 100644 index 0000000000..0f7a151f9c --- /dev/null +++ b/numba_dpex/tests/kernel_api/test_local_accessor.py @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: 2024 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import numpy +import pytest + +from numba_dpex import kernel_api as kapi + + +def _slm_kernel(nd_item: kapi.NdItem, a, slm): + i = nd_item.get_global_linear_id() + j = nd_item.get_local_linear_id() + + slm[j] = 100 + a[i] = slm[i] + + +def test_local_accessor_data_inaccessible_outside_kernel(): + la = kapi.LocalAccessor((100,), dtype=numpy.float32) + + with pytest.raises(NotImplementedError): + print(la[0]) + + with pytest.raises(NotImplementedError): + la[0] = 10 + + +def test_local_accessor_use_inside_kernel(): + + a = numpy.empty(32) + slm = kapi.LocalAccessor(32, dtype=a.dtype) + + # launches one work group with 32 work item. Each work item initializes its + # position in the SLM to 100 and then writes it to the global array `a`. + kapi.call_kernel(_slm_kernel, kapi.NdRange((32,), (32,)), a, slm) + + assert numpy.all(a == 100) + + +def test_local_accessor_usage_not_allowed_with_range_kernel(): + + a = numpy.empty(32) + slm = kapi.LocalAccessor(32, dtype=a.dtype) + + with pytest.raises(TypeError): + kapi.call_kernel(_slm_kernel, kapi.Range((32,)), a, slm) From be9d25e1d5a99b3ebddc5b5a87809db566efa8ba Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Wed, 21 Feb 2024 12:48:58 -0600 Subject: [PATCH 03/12] Add numba typing infrastructure for LocalAccessor --- .../core/types/kernel_api/local_accessor.py | 66 +++++++++++++++++++ numba_dpex/experimental/__init__.py | 1 + numba_dpex/experimental/models.py | 9 ++- numba_dpex/experimental/typeof.py | 16 ++++- 4 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 numba_dpex/core/types/kernel_api/local_accessor.py diff --git a/numba_dpex/core/types/kernel_api/local_accessor.py b/numba_dpex/core/types/kernel_api/local_accessor.py new file mode 100644 index 0000000000..9d82ea9802 --- /dev/null +++ b/numba_dpex/core/types/kernel_api/local_accessor.py @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: 2024 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from numba.core.pythonapi import unbox +from numba.core.types import Array, Type +from numba.np import numpy_support + +from numba_dpex.core.types import USMNdArray +from numba_dpex.utils import address_space as AddressSpace + + +class LocalAccessorType(USMNdArray): + """numba-dpex internal type to represent a Python object of + :class:`numba_dpex.experimental.kernel_iface.LocalAccessor`. + """ + + def __init__(self, ndim, dtype): + try: + if isinstance(dtype, Type): + parsed_dtype = dtype + else: + parsed_dtype = numpy_support.from_dtype(dtype) + except NotImplementedError as exc: + raise ValueError(f"Unsupported array dtype: {dtype}") from exc + + type_name = ( + f"LocalAccessor(dtype={parsed_dtype}, ndim={ndim}, " + f"address_space={AddressSpace.LOCAL})" + ) + + super().__init__( + ndim=ndim, + layout="C", + dtype=parsed_dtype, + addrspace=AddressSpace.LOCAL, + name=type_name, + ) + + def cast_python_value(self, args): + """The helper function is not overloaded and using it on the + LocalAccessorType throws a NotImplementedError. + """ + raise NotImplementedError + + +@unbox(LocalAccessorType) +def unbox_local_accessor(typ, obj, c): # pylint: disable=unused-argument + """Unboxes a Python LocalAccessor PyObject* into a numba-dpex internal + representation. + + A LocalAccessor object is represented internally in numba-dpex with the + same data model as a numpy.ndarray. It is done as a LocalAccessor object + serves only as a placeholder type when passed to ``call_kernel`` and the + data buffer should never be accessed inside a host-side compiled function + such as ``call_kernel``. + + When a LocalAccessor object is passed as an argument to a kernel function + it uses the USMArrayDeviceModel. Doing so allows numba-dpex to correctly + generate the kernel signature passing in a pointer in the local address + space. + """ + + nparrobj = c.pyapi.object_getattr_string(obj, "_data") + nparrtype = Array(typ.dtype, typ.ndim, typ.layout, readonly=False) + return c.unbox(nparrtype, nparrobj) diff --git a/numba_dpex/experimental/__init__.py b/numba_dpex/experimental/__init__.py index f764e40843..97134ee042 100644 --- a/numba_dpex/experimental/__init__.py +++ b/numba_dpex/experimental/__init__.py @@ -12,6 +12,7 @@ from numba_dpex.core.boxing import * from numba_dpex.kernel_api_impl.spirv.dispatcher import SPIRVKernelDispatcher +from . import typeof from ._kernel_dpcpp_spirv_overloads import ( _atomic_fence_overloads, _atomic_ref_overloads, diff --git a/numba_dpex/experimental/models.py b/numba_dpex/experimental/models.py index f9e7a2f53d..48119d322a 100644 --- a/numba_dpex/experimental/models.py +++ b/numba_dpex/experimental/models.py @@ -8,7 +8,7 @@ from numba.core import types from numba.core.datamodel import DataModelManager, models -from numba.core.datamodel.models import StructModel +from numba.core.datamodel.models import ArrayModel, StructModel from numba.core.extending import register_model import numba_dpex.core.datamodel.models as dpex_core_models @@ -19,6 +19,7 @@ ) from ..core.types.kernel_api.atomic_ref import AtomicRefType +from ..core.types.kernel_api.local_accessor import LocalAccessorType from .types import KernelDispatcherType @@ -60,6 +61,8 @@ def _init_exp_data_model_manager() -> DataModelManager: # Register the types and data model in the DpexExpTargetContext dmm.register(AtomicRefType, AtomicRefModel) + dmm.register(LocalAccessorType, dpex_core_models.USMArrayDeviceModel) + # Register the GroupType type dmm.register(GroupType, EmptyStructModel) @@ -85,3 +88,7 @@ def _init_exp_data_model_manager() -> DataModelManager: # Register the NdItemType type register_model(NdItemType)(EmptyStructModel) + +# The LocalAccessorType is registered with the EmptyStructModel in the default +# data manager so that its attributes are not accessible inside dpjit. +register_model(LocalAccessorType)(ArrayModel) diff --git a/numba_dpex/experimental/typeof.py b/numba_dpex/experimental/typeof.py index e72c951a0f..dcdcd1deb9 100644 --- a/numba_dpex/experimental/typeof.py +++ b/numba_dpex/experimental/typeof.py @@ -14,7 +14,8 @@ ItemType, NdItemType, ) -from numba_dpex.kernel_api import AtomicRef, Group, Item, NdItem +from numba_dpex.core.types.kernel_api.local_accessor import LocalAccessorType +from numba_dpex.kernel_api import AtomicRef, Group, Item, LocalAccessor, NdItem from ..core.types.kernel_api.atomic_ref import AtomicRefType @@ -84,3 +85,16 @@ def typeof_nditem(val: NdItem, c): instance. """ return NdItemType(val.dimensions) + + +@typeof_impl.register(LocalAccessor) +def typeof_local_accessor(val: LocalAccessor, c) -> LocalAccessorType: + """Returns a ``numba_dpex.experimental.dpctpp_types.LocalAccessorType`` + instance for a Python LocalAccessor object. + Args: + val (LocalAccessor): Instance of the LocalAccessor type. + c : Numba typing context used for type inference. + Returns: LocalAccessorType object corresponding to the LocalAccessor object. + """ + # pylint: disable=protected-access + return LocalAccessorType(ndim=val._data.ndim, dtype=val._data.dtype) From b89365fc8dbf8e6002cd9cb566d4ce8dfc1f2e2e Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Fri, 15 Mar 2024 12:25:55 -0500 Subject: [PATCH 04/12] Updates the kernel_launcher API to handle LocalAccessor --- .../utils/kernel_flattened_args_builder.py | 168 +++++++++++++++++- numba_dpex/core/utils/kernel_launcher.py | 5 +- 2 files changed, 165 insertions(+), 8 deletions(-) diff --git a/numba_dpex/core/utils/kernel_flattened_args_builder.py b/numba_dpex/core/utils/kernel_flattened_args_builder.py index 1df93722db..dd30be38fb 100644 --- a/numba_dpex/core/utils/kernel_flattened_args_builder.py +++ b/numba_dpex/core/utils/kernel_flattened_args_builder.py @@ -9,12 +9,14 @@ from typing import NamedTuple +import dpctl from llvmlite import ir as llvmir from numba.core import types from numba.core.cpu import CPUContext from numba_dpex import utils from numba_dpex.core.types import USMNdArray +from numba_dpex.core.types.kernel_api.local_accessor import LocalAccessorType from numba_dpex.dpctl_iface._helpers import numba_type_to_dpctl_typenum @@ -70,7 +72,7 @@ def add_argument( arg_type, arg_packed_llvm_val, ): - """Add kernel argument that need to be flatten.""" + """Add flattened representation of a kernel argument.""" if isinstance(arg_type, USMNdArray): self._kernel_arg_list.extend( self._build_array_arg( @@ -118,6 +120,40 @@ def print_kernel_arg_list(self) -> None: for karg in args_list: print(f" {karg.llvm_val} of typeid {karg.typeid}") + def _allocate_local_accessor_metadata_struct(self): + """Allocates a struct into the current function to store the metadata + that should be passed to libsyclinterface to allocate a + sycl::local_accessor object. The constructor of the sycl::local_accessor + class is: local_accessor(range r). + + For this reason, the struct is allocated as: + + LOCAL_ACCESSOR_MDSTRUCT_TYPE = llvmir.LiteralStructType( + [ + llvmir.IntType(64), # Ndim (0..3] + llvmir.IntType(32), # typeid + llvmir.IntType(64), # Dim0 extent + llvmir.IntType(64), # Dim1 extent or NULL + llvmir.IntType(64), # Dim2 extent or NULL + ] + ) + """ + local_accessor_mdstruct_type = llvmir.LiteralStructType( + [ + llvmir.IntType(64), + llvmir.IntType(32), + llvmir.IntType(64), + llvmir.IntType(64), + llvmir.IntType(64), + ] + ) + + struct_ref = None + with self._builder.goto_entry_block(): + struct_ref = self._builder.alloca(typ=local_accessor_mdstruct_type) + + return struct_ref + def _build_arg(self, llvm_val, numba_type): """Returns a KernelArg to be passed to a DPCTLQueue_Submit call. @@ -213,6 +249,114 @@ def _store_val_into_struct(self, struct_ref, index, val): ), ) + def _build_local_accessor_metadata_arg( + self, llvm_val, arg_type, data_attr_ty + ): + """Handles the special case of building the kernel argument for the data + attribute of a kernel_api.LocalAccessor object. + + A kernel_api.LocalAccessor conceptually represents a device-only memory + allocation. The mock kernel_api.LocalAccessor uses a numpy.ndarray to + represent the data allocation. The numpy.ndarray cannot be passed to the + kernel and is ignored when building the kernel argument. Instead, a + struct is allocated to store the metadata about the size of the device + memory allocation and a reference to the struct is passed to the + DPCTLQueue_Submit call. The DPCTLQueue_Submit then constructs a + sycl::local_accessor object using the metadata and passes the + sycl::local_accessor as the kernel argument, letting the DPC++ runtime + handle proper device memory allocation. + """ + + kernel_data_model = self._kernel_dmm.lookup(arg_type) + host_data_model = self._context.data_model_manager.lookup(arg_type) + shape_member = kernel_data_model.get_member_fe_type("shape") + shape_member_pos = host_data_model.get_field_position("shape") + ndim = shape_member.count + + mdstruct_ref = self._allocate_local_accessor_metadata_struct() + + # Store the number of dimensions in the local accessor + self._store_val_into_struct( + mdstruct_ref, + index=0, + val=self._context.get_constant(types.int64, ndim), + ) + # Get the underlying dtype of the data (a CPointer) attribute of a + # local_accessor object + self._store_val_into_struct( + mdstruct_ref, + index=1, + val=numba_type_to_dpctl_typenum(self._context, data_attr_ty.dtype), + ) + # Extract and store the shape values from array into mdstruct + shape_attr = self._builder.gep( + llvm_val, + [ + self._context.get_constant(types.int32, 0), + self._context.get_constant(types.int32, shape_member_pos), + ], + ) + # Store the extent of the 1st dimension of the local accessor + dim0_shape_ext = self._builder.gep( + shape_attr, + [ + self._context.get_constant(types.int32, 0), + self._context.get_constant(types.int32, 0), + ], + ) + self._store_val_into_struct( + mdstruct_ref, + index=2, + val=self._builder.load(dim0_shape_ext), + ) + + if ndim == 2: + dim1_shape_ext = self._builder.gep( + shape_attr, + [ + self._context.get_constant(types.int32, 0), + self._context.get_constant(types.int32, 1), + ], + ) + self._store_val_into_struct( + mdstruct_ref, + index=3, + val=self._builder.load(dim1_shape_ext), + ) + else: + self._store_val_into_struct( + mdstruct_ref, + index=3, + val=self._context.get_constant(types.int64, 1), + ) + + if ndim == 3: + dim2_shape_ext = self._builder.gep( + shape_attr, + [ + self._context.get_constant(types.int32, 0), + self._context.get_constant(types.int32, 2), + ], + ) + self._store_val_into_struct( + mdstruct_ref, + index=4, + val=self._builder.load(dim2_shape_ext), + ) + else: + self._store_val_into_struct( + mdstruct_ref, + index=4, + val=self._context.get_constant(types.int64, 1), + ) + + return self._build_arg( + llvm_val=mdstruct_ref, + numba_type=LocalAccessorType( + ndim, dpctl.tensor.dtype(data_attr_ty.dtype.name) + ), + ) + def _build_array_arg(self, arg_type, llvm_array_val): """Creates a list of LLVM Values for an unpacked USMNdArray kernel argument. @@ -240,13 +384,23 @@ def _build_array_arg(self, arg_type, llvm_array_val): # Argument data data_attr_pos = host_data_model.get_field_position("data") data_attr_ty = kernel_data_model.get_member_fe_type("data") - kernel_arg_list.extend( - self._build_collections_attr_arg( - llvm_val=llvm_array_val, - attr_index=data_attr_pos, - attr_type=data_attr_ty, + + if isinstance(arg_type, LocalAccessorType): + kernel_arg_list.extend( + self._build_local_accessor_metadata_arg( + llvm_val=llvm_array_val, + arg_type=arg_type, + data_attr_ty=data_attr_ty, + ) + ) + else: + kernel_arg_list.extend( + self._build_collections_attr_arg( + llvm_val=llvm_array_val, + attr_index=data_attr_pos, + attr_type=data_attr_ty, + ) ) - ) # Arguments for shape kernel_arg_list.extend( self._build_unituple_member_arg( diff --git a/numba_dpex/core/utils/kernel_launcher.py b/numba_dpex/core/utils/kernel_launcher.py index d0f9426d88..f20a58efc6 100644 --- a/numba_dpex/core/utils/kernel_launcher.py +++ b/numba_dpex/core/utils/kernel_launcher.py @@ -21,6 +21,7 @@ from numba_dpex.core.exceptions import UnreachableError from numba_dpex.core.runtime.context import DpexRTContext from numba_dpex.core.types import USMNdArray +from numba_dpex.core.types.kernel_api.local_accessor import LocalAccessorType from numba_dpex.core.types.kernel_api.ranges import NdRangeType, RangeType from numba_dpex.core.utils.kernel_flattened_args_builder import ( KernelFlattenedArgsBuilder, @@ -675,7 +676,9 @@ def get_queue_from_llvm_values( the queue from the first USMNdArray argument can be extracted. """ for arg_num, argty in enumerate(ty_kernel_args): - if isinstance(argty, USMNdArray): + if isinstance(argty, USMNdArray) and not isinstance( + argty, LocalAccessorType + ): llvm_val = ll_kernel_args[arg_num] datamodel = ctx.data_model_manager.lookup(argty) sycl_queue_attr_pos = datamodel.get_field_position("sycl_queue") From 4f5966075fcb5cd38bf4d4f2fa01e3907a554998 Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Fri, 15 Mar 2024 12:27:33 -0500 Subject: [PATCH 05/12] Update numba_type_to_dpctl_typenum to handle LocalAccessor --- numba_dpex/dpctl_iface/_helpers.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/numba_dpex/dpctl_iface/_helpers.py b/numba_dpex/dpctl_iface/_helpers.py index f46915eaf0..cd72014d84 100644 --- a/numba_dpex/dpctl_iface/_helpers.py +++ b/numba_dpex/dpctl_iface/_helpers.py @@ -5,6 +5,7 @@ from numba.core import types from numba_dpex import dpctl_sem_version +from numba_dpex.core.types.kernel_api.local_accessor import LocalAccessorType def numba_type_to_dpctl_typenum(context, ty): @@ -34,6 +35,10 @@ def numba_type_to_dpctl_typenum(context, ty): return context.get_constant( types.int32, kargty.dpctl_void_ptr.value ) + elif isinstance(ty, LocalAccessorType): + return context.get_constant( + types.int32, kargty.dpctl_local_accessor.value + ) else: raise NotImplementedError else: @@ -61,5 +66,9 @@ def numba_type_to_dpctl_typenum(context, ty): elif ty == types.voidptr or isinstance(ty, types.CPointer): # DPCTL_VOID_PTR return context.get_constant(types.int32, 15) + elif isinstance(ty, LocalAccessorType): + raise NotImplementedError( + "LocalAccessor args for kernels requires dpctl 0.17 or greater." + ) else: raise NotImplementedError From da0d6936337750047b2aa66276b947674b0a20b0 Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Fri, 8 Mar 2024 23:10:36 -0600 Subject: [PATCH 06/12] Add a unit test for local accessor --- .../spv_overloads/test_local_accessors.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py diff --git a/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py b/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py new file mode 100644 index 0000000000..6721c830e3 --- /dev/null +++ b/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: 2023 - 2024 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +import dpnp +import pytest + +import numba_dpex as dpex +import numba_dpex.experimental as dpex_exp +from numba_dpex.kernel_api import ( + LocalAccessor, + MemoryScope, + NdItem, + group_barrier, +) +from numba_dpex.tests._helper import get_all_dtypes + +list_of_supported_dtypes = get_all_dtypes( + no_bool=True, no_float16=True, no_none=True, no_complex=True +) + + +@pytest.mark.parametrize("supported_dtype", list_of_supported_dtypes) +def test_local_accessor(supported_dtype): + """A test for passing a LocalAccessor object as a kernel argument.""" + + @dpex_exp.kernel + def _kernel(nd_item: NdItem, a, slm): + i = nd_item.get_global_linear_id() + j = nd_item.get_local_linear_id() + + slm[j] = 0 + group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) + + for m in range(100): + slm[j] += i * m + group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) + + a[i] = slm[j] + + N = 32 + a = dpnp.empty(N, dtype=supported_dtype) + slm = LocalAccessor((32 * 64), dtype=a.dtype) + + # A single work group with 32 work items is launched. Each work item + # computes the sum of (0..99) * its get_global_linear_id i.e., + # `4950 * get_global_linear_id` and stores it into the work groups local + # memory. The local memory is of size 32*64 elements of the requested dtype. + # The result is then stored into `a` in global memory + dpex_exp.call_kernel(_kernel, dpex.NdRange((N,), (32,)), a, slm) + + for idx in range(N): + assert a[idx] == 4950 * idx From 0e29928d7a2c4ea5d196c89c11aa15a28ad2a5e2 Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Sun, 10 Mar 2024 01:36:13 -0600 Subject: [PATCH 07/12] Add a codegen unit test for local accessor kernel arg. --- .../codegen/test_local_accessor_kernel_arg.py | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 numba_dpex/tests/experimental/codegen/test_local_accessor_kernel_arg.py diff --git a/numba_dpex/tests/experimental/codegen/test_local_accessor_kernel_arg.py b/numba_dpex/tests/experimental/codegen/test_local_accessor_kernel_arg.py new file mode 100644 index 0000000000..26905bf19b --- /dev/null +++ b/numba_dpex/tests/experimental/codegen/test_local_accessor_kernel_arg.py @@ -0,0 +1,67 @@ +# SPDX-FileCopyrightText: 2023 - 2024 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpctl +from llvmlite import ir as llvmir +from numba.core import types + +from numba_dpex import DpctlSyclQueue, DpnpNdArray +from numba_dpex import experimental as dpex_exp +from numba_dpex import int64 +from numba_dpex.core.types.kernel_api.index_space_ids import NdItemType +from numba_dpex.core.types.kernel_api.local_accessor import LocalAccessorType +from numba_dpex.kernel_api import ( + AddressSpace, + MemoryScope, + NdItem, + group_barrier, +) + + +def kernel_func(nd_item: NdItem, a, slm): + i = nd_item.get_global_linear_id() + j = nd_item.get_local_linear_id() + + slm[j] = 100 + group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) + + a[i] += slm[j] + + +def test_codegen_local_accessor_kernel_arg(): + """Tests if a kernel with a local accessor argument is generated with + expected local address space pointer argument. + """ + + queue_ty = DpctlSyclQueue(dpctl.SyclQueue()) + i64arr_ty = DpnpNdArray(ndim=1, dtype=int64, layout="C", queue=queue_ty) + slm_ty = LocalAccessorType(ndim=1, dtype=int64) + disp = dpex_exp.kernel(inline_threshold=3)(kernel_func) + dmm = disp.targetctx.data_model_manager + + i64arr_ty_flattened_arg_count = dmm.lookup(i64arr_ty).flattened_field_count + slm_ty_model = dmm.lookup(slm_ty) + slm_ty_flattened_arg_count = slm_ty_model.flattened_field_count + slm_ptr_pos = slm_ty_model.get_field_position("data") + + llargtys = disp.targetctx.get_arg_packer([i64arr_ty, slm_ty]).argument_types + + # Go over all the arguments to the spir_kernel_func and assert two things: + # a) Number of arguments == i64arr_ty_flattened_arg_count + # + slm_ty_flattened_arg_count + # b) The argument corresponding to the data attribute of the local accessor + # argument is a pointer in address space local address space + + num_kernel_args = 0 + slm_data_ptr_arg = None + for kernel_arg in llargtys: + if num_kernel_args == i64arr_ty_flattened_arg_count + slm_ptr_pos: + slm_data_ptr_arg = kernel_arg + num_kernel_args += 1 + assert ( + num_kernel_args + == i64arr_ty_flattened_arg_count + slm_ty_flattened_arg_count + ) + assert isinstance(slm_data_ptr_arg, llvmir.PointerType) + assert slm_data_ptr_arg.addrspace == AddressSpace.LOCAL From 3b93e8c51e8f4b3a06d8be5e4deaa5632a21a60c Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Sun, 10 Mar 2024 23:16:59 -0500 Subject: [PATCH 08/12] Disallow LocalAccessor arguments to RangeType kernels --- numba_dpex/experimental/launcher.py | 30 ++++++++++++++ .../spv_overloads/test_local_accessors.py | 41 +++++++++++++------ 2 files changed, 59 insertions(+), 12 deletions(-) diff --git a/numba_dpex/experimental/launcher.py b/numba_dpex/experimental/launcher.py index 82809a4c9e..44827835e5 100644 --- a/numba_dpex/experimental/launcher.py +++ b/numba_dpex/experimental/launcher.py @@ -25,6 +25,7 @@ ItemType, NdItemType, ) +from numba_dpex.core.types.kernel_api.local_accessor import LocalAccessorType from numba_dpex.core.utils import kernel_launcher as kl from numba_dpex.dpctl_iface import libsyclinterface_bindings as sycl from numba_dpex.dpctl_iface.wrappers import wrap_event_reference @@ -42,6 +43,23 @@ class _LLRange(NamedTuple): local_range_extents: list +def _has_a_local_accessor_argument(args): + """Checks if there exists at least one LocalAccessorType object in the + input tuple. + + Args: + args (_type_): A tuple of numba.core.Type objects + + Returns: + bool : True if at least one LocalAccessorType object was found, + otherwise False. + """ + for arg in args: + if isinstance(arg, LocalAccessorType): + return True + return False + + def _wrap_event_reference_tuple(ctx, builder, event1, event2): """Creates tuple data model from two event data models, so it can be boxed to Python.""" @@ -153,6 +171,18 @@ def _submit_kernel( # pylint: disable=too-many-arguments DeprecationWarning, ) + # Validate local accessor arguments are passed only to a kernel that is + # launched with an NdRange index space. Reference section 4.7.6.11. of the + # SYCL 2020 specification: A local_accessor must not be used in a SYCL + # kernel function that is invoked via single_task or via the simple form of + # parallel_for that takes a range parameter. + if _has_a_local_accessor_argument(ty_kernel_args_tuple) and isinstance( + ty_index_space, RangeType + ): + raise TypeError( + "A RangeType kernel cannot have a LocalAccessor argument" + ) + # ty_kernel_fn is type specific to exact function, so we can get function # directly from type and compile it. Thats why we don't need to get it in # codegen diff --git a/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py b/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py index 6721c830e3..1ce33c63f5 100644 --- a/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py +++ b/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py @@ -5,6 +5,7 @@ import dpnp import pytest +from numba.core.errors import TypingError import numba_dpex as dpex import numba_dpex.experimental as dpex_exp @@ -21,23 +22,24 @@ ) -@pytest.mark.parametrize("supported_dtype", list_of_supported_dtypes) -def test_local_accessor(supported_dtype): - """A test for passing a LocalAccessor object as a kernel argument.""" +@dpex_exp.kernel +def _kernel(nd_item: NdItem, a, slm): + i = nd_item.get_global_linear_id() + j = nd_item.get_local_linear_id() - @dpex_exp.kernel - def _kernel(nd_item: NdItem, a, slm): - i = nd_item.get_global_linear_id() - j = nd_item.get_local_linear_id() + slm[j] = 0 + group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) - slm[j] = 0 + for m in range(100): + slm[j] += i * m group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) - for m in range(100): - slm[j] += i * m - group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) + a[i] = slm[j] - a[i] = slm[j] + +@pytest.mark.parametrize("supported_dtype", list_of_supported_dtypes) +def test_local_accessor(supported_dtype): + """A test for passing a LocalAccessor object as a kernel argument.""" N = 32 a = dpnp.empty(N, dtype=supported_dtype) @@ -52,3 +54,18 @@ def _kernel(nd_item: NdItem, a, slm): for idx in range(N): assert a[idx] == 4950 * idx + + +def test_local_accessor_argument_to_range_kernel(): + """Checks if an exception is raised when passing a local accessor to a + RangeType kernel. + """ + N = 32 + a = dpnp.empty(N) + slm = LocalAccessor((32 * 64), dtype=a.dtype) + + # Passing a local_accessor to a RangeType kernel should raise an exception. + # A TypeError is raised if NUMBA_CAPTURED_ERROR=new_style and a + # numba.TypingError is raised if NUMBA_CAPTURED_ERROR=old_style + with pytest.raises((TypeError, TypingError)): + dpex_exp.call_kernel(_kernel, dpex.Range(N), a, slm) From fe8d1d2f183f91d8ba90c8815da76aa9d8b25df4 Mon Sep 17 00:00:00 2001 From: Yevhenii Havrylko Date: Tue, 19 Mar 2024 11:26:27 -0400 Subject: [PATCH 09/12] Add DpctlMDLocalAccessorType --- .../core/types/kernel_api/local_accessor.py | 9 ++ .../utils/kernel_flattened_args_builder.py | 137 +++--------------- numba_dpex/experimental/models.py | 28 +++- 3 files changed, 57 insertions(+), 117 deletions(-) diff --git a/numba_dpex/core/types/kernel_api/local_accessor.py b/numba_dpex/core/types/kernel_api/local_accessor.py index 9d82ea9802..434c9f5f7d 100644 --- a/numba_dpex/core/types/kernel_api/local_accessor.py +++ b/numba_dpex/core/types/kernel_api/local_accessor.py @@ -10,6 +10,15 @@ from numba_dpex.utils import address_space as AddressSpace +class DpctlMDLocalAccessorType(Type): + """numba-dpex internal type to represent a dpctl SyclInterface type + `MDLocalAccessorTy`. + """ + + def __init__(self): + super().__init__(name="DpctlMDLocalAccessor") + + class LocalAccessorType(USMNdArray): """numba-dpex internal type to represent a Python object of :class:`numba_dpex.experimental.kernel_iface.LocalAccessor`. diff --git a/numba_dpex/core/utils/kernel_flattened_args_builder.py b/numba_dpex/core/utils/kernel_flattened_args_builder.py index dd30be38fb..cea608a285 100644 --- a/numba_dpex/core/utils/kernel_flattened_args_builder.py +++ b/numba_dpex/core/utils/kernel_flattened_args_builder.py @@ -11,12 +11,15 @@ import dpctl from llvmlite import ir as llvmir -from numba.core import types +from numba.core import cgutils, types from numba.core.cpu import CPUContext from numba_dpex import utils from numba_dpex.core.types import USMNdArray -from numba_dpex.core.types.kernel_api.local_accessor import LocalAccessorType +from numba_dpex.core.types.kernel_api.local_accessor import ( + DpctlMDLocalAccessorType, + LocalAccessorType, +) from numba_dpex.dpctl_iface._helpers import numba_type_to_dpctl_typenum @@ -120,40 +123,6 @@ def print_kernel_arg_list(self) -> None: for karg in args_list: print(f" {karg.llvm_val} of typeid {karg.typeid}") - def _allocate_local_accessor_metadata_struct(self): - """Allocates a struct into the current function to store the metadata - that should be passed to libsyclinterface to allocate a - sycl::local_accessor object. The constructor of the sycl::local_accessor - class is: local_accessor(range r). - - For this reason, the struct is allocated as: - - LOCAL_ACCESSOR_MDSTRUCT_TYPE = llvmir.LiteralStructType( - [ - llvmir.IntType(64), # Ndim (0..3] - llvmir.IntType(32), # typeid - llvmir.IntType(64), # Dim0 extent - llvmir.IntType(64), # Dim1 extent or NULL - llvmir.IntType(64), # Dim2 extent or NULL - ] - ) - """ - local_accessor_mdstruct_type = llvmir.LiteralStructType( - [ - llvmir.IntType(64), - llvmir.IntType(32), - llvmir.IntType(64), - llvmir.IntType(64), - llvmir.IntType(64), - ] - ) - - struct_ref = None - with self._builder.goto_entry_block(): - struct_ref = self._builder.alloca(typ=local_accessor_mdstruct_type) - - return struct_ref - def _build_arg(self, llvm_val, numba_type): """Returns a KernelArg to be passed to a DPCTLQueue_Submit call. @@ -250,7 +219,7 @@ def _store_val_into_struct(self, struct_ref, index, val): ) def _build_local_accessor_metadata_arg( - self, llvm_val, arg_type, data_attr_ty + self, llvm_val, arg_type: LocalAccessorType, data_attr_ty ): """Handles the special case of building the kernel argument for the data attribute of a kernel_api.LocalAccessor object. @@ -267,91 +236,27 @@ def _build_local_accessor_metadata_arg( handle proper device memory allocation. """ - kernel_data_model = self._kernel_dmm.lookup(arg_type) - host_data_model = self._context.data_model_manager.lookup(arg_type) - shape_member = kernel_data_model.get_member_fe_type("shape") - shape_member_pos = host_data_model.get_field_position("shape") - ndim = shape_member.count - - mdstruct_ref = self._allocate_local_accessor_metadata_struct() + ndim = arg_type.ndim - # Store the number of dimensions in the local accessor - self._store_val_into_struct( - mdstruct_ref, - index=0, - val=self._context.get_constant(types.int64, ndim), - ) - # Get the underlying dtype of the data (a CPointer) attribute of a - # local_accessor object - self._store_val_into_struct( - mdstruct_ref, - index=1, - val=numba_type_to_dpctl_typenum(self._context, data_attr_ty.dtype), - ) - # Extract and store the shape values from array into mdstruct - shape_attr = self._builder.gep( - llvm_val, - [ - self._context.get_constant(types.int32, 0), - self._context.get_constant(types.int32, shape_member_pos), - ], - ) - # Store the extent of the 1st dimension of the local accessor - dim0_shape_ext = self._builder.gep( - shape_attr, - [ - self._context.get_constant(types.int32, 0), - self._context.get_constant(types.int32, 0), - ], + md_proxy = cgutils.create_struct_proxy(DpctlMDLocalAccessorType())( + self._context, + self._builder, ) - self._store_val_into_struct( - mdstruct_ref, - index=2, - val=self._builder.load(dim0_shape_ext), + la_proxy = cgutils.create_struct_proxy(arg_type)( + self._context, self._builder, value=self._builder.load(llvm_val) ) - if ndim == 2: - dim1_shape_ext = self._builder.gep( - shape_attr, - [ - self._context.get_constant(types.int32, 0), - self._context.get_constant(types.int32, 1), - ], - ) - self._store_val_into_struct( - mdstruct_ref, - index=3, - val=self._builder.load(dim1_shape_ext), - ) - else: - self._store_val_into_struct( - mdstruct_ref, - index=3, - val=self._context.get_constant(types.int64, 1), - ) - - if ndim == 3: - dim2_shape_ext = self._builder.gep( - shape_attr, - [ - self._context.get_constant(types.int32, 0), - self._context.get_constant(types.int32, 2), - ], - ) - self._store_val_into_struct( - mdstruct_ref, - index=4, - val=self._builder.load(dim2_shape_ext), - ) - else: - self._store_val_into_struct( - mdstruct_ref, - index=4, - val=self._context.get_constant(types.int64, 1), - ) + md_proxy.ndim = self._context.get_constant(types.int64, ndim) + md_proxy.dpctl_type_id = numba_type_to_dpctl_typenum( + self._context, data_attr_ty.dtype + ) + for i, val in enumerate( + cgutils.unpack_tuple(self._builder, la_proxy.shape) + ): + setattr(md_proxy, f"dim{i}", val) return self._build_arg( - llvm_val=mdstruct_ref, + llvm_val=md_proxy._getpointer(), numba_type=LocalAccessorType( ndim, dpctl.tensor.dtype(data_attr_ty.dtype.name) ), diff --git a/numba_dpex/experimental/models.py b/numba_dpex/experimental/models.py index 48119d322a..37c861f6e8 100644 --- a/numba_dpex/experimental/models.py +++ b/numba_dpex/experimental/models.py @@ -19,7 +19,10 @@ ) from ..core.types.kernel_api.atomic_ref import AtomicRefType -from ..core.types.kernel_api.local_accessor import LocalAccessorType +from ..core.types.kernel_api.local_accessor import ( + DpctlMDLocalAccessorType, + LocalAccessorType, +) from .types import KernelDispatcherType @@ -45,6 +48,26 @@ def __init__(self, dmm, fe_type): super().__init__(dmm, fe_type, members) +class DpctlMDLocalAccessorModel(StructModel): + """Data model to represent DpctlMDLocalAccessorType. + + Must be the same structure as + dpctl/syclinterface/dpctl_sycl_queue_interface.h::MDLocalAccessor. + + Structure intended to be used only on host side of the kernel call. + """ + + def __init__(self, dmm, fe_type): + members = [ + ("ndim", types.size_t), + ("dpctl_type_id", types.int32), + ("dim0", types.size_t), + ("dim1", types.size_t), + ("dim2", types.size_t), + ] + super().__init__(dmm, fe_type, members) + + def _init_exp_data_model_manager() -> DataModelManager: """Initializes a DpexExpKernelTarget-specific data model manager. @@ -89,6 +112,9 @@ def _init_exp_data_model_manager() -> DataModelManager: # Register the NdItemType type register_model(NdItemType)(EmptyStructModel) +# Register the MDLocalAccessorType type +register_model(DpctlMDLocalAccessorType)(DpctlMDLocalAccessorModel) + # The LocalAccessorType is registered with the EmptyStructModel in the default # data manager so that its attributes are not accessible inside dpjit. register_model(LocalAccessorType)(ArrayModel) From c086ae63be1ae369f438a0ede884214caf4398ae Mon Sep 17 00:00:00 2001 From: Yevhenii Havrylko Date: Tue, 19 Mar 2024 11:27:29 -0400 Subject: [PATCH 10/12] Update LocalAccessor host model that contains only shape --- .../core/types/kernel_api/local_accessor.py | 21 ++-- .../utils/kernel_flattened_args_builder.py | 102 +++++++++++++++--- numba_dpex/experimental/models.py | 22 +++- numba_dpex/experimental/typeof.py | 2 +- numba_dpex/kernel_api/local_accessor.py | 12 +-- 5 files changed, 125 insertions(+), 34 deletions(-) diff --git a/numba_dpex/core/types/kernel_api/local_accessor.py b/numba_dpex/core/types/kernel_api/local_accessor.py index 434c9f5f7d..a34ce73b8f 100644 --- a/numba_dpex/core/types/kernel_api/local_accessor.py +++ b/numba_dpex/core/types/kernel_api/local_accessor.py @@ -2,8 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -from numba.core.pythonapi import unbox -from numba.core.types import Array, Type +from numba.core import cgutils +from numba.core.types import Type, UniTuple, intp +from numba.extending import NativeValue, unbox from numba.np import numpy_support from numba_dpex.core.types import USMNdArray @@ -69,7 +70,15 @@ def unbox_local_accessor(typ, obj, c): # pylint: disable=unused-argument generate the kernel signature passing in a pointer in the local address space. """ - - nparrobj = c.pyapi.object_getattr_string(obj, "_data") - nparrtype = Array(typ.dtype, typ.ndim, typ.layout, readonly=False) - return c.unbox(nparrtype, nparrobj) + shape = c.pyapi.object_getattr_string(obj, "_shape") + local_accessor = cgutils.create_struct_proxy(typ)(c.context, c.builder) + + ty_unituple = UniTuple(intp, typ.ndim) + ll_shape = c.unbox(ty_unituple, shape) + local_accessor.shape = ll_shape.value + + return NativeValue( + c.builder.load(local_accessor._getpointer()), + is_error=ll_shape.is_error, + cleanup=ll_shape.cleanup, + ) diff --git a/numba_dpex/core/utils/kernel_flattened_args_builder.py b/numba_dpex/core/utils/kernel_flattened_args_builder.py index cea608a285..c00ca17d44 100644 --- a/numba_dpex/core/utils/kernel_flattened_args_builder.py +++ b/numba_dpex/core/utils/kernel_flattened_args_builder.py @@ -7,6 +7,8 @@ object. """ +from functools import reduce +from math import ceil from typing import NamedTuple import dpctl @@ -76,7 +78,13 @@ def add_argument( arg_packed_llvm_val, ): """Add flattened representation of a kernel argument.""" - if isinstance(arg_type, USMNdArray): + if isinstance(arg_type, LocalAccessorType): + self._kernel_arg_list.extend( + self._build_local_accessor_arg( + arg_type, llvm_val=arg_packed_llvm_val + ) + ) + elif isinstance(arg_type, USMNdArray): self._kernel_arg_list.extend( self._build_array_arg( arg_type, llvm_array_val=arg_packed_llvm_val @@ -262,6 +270,77 @@ def _build_local_accessor_metadata_arg( ), ) + def _build_local_accessor_arg(self, arg_type: LocalAccessorType, llvm_val): + """Creates a list of kernel LLVM Values for an unpacked USMNdArray + kernel argument from the local accessor. + + Method generates UsmNdArray fields from local accessor type and value. + """ + # TODO: move extra values build on device side of codegen. + ndim = arg_type.ndim + la_proxy = cgutils.create_struct_proxy(arg_type)( + self._context, self._builder, value=self._builder.load(llvm_val) + ) + shape = cgutils.unpack_tuple(self._builder, la_proxy.shape) + ll_size = reduce(self._builder.mul, shape) + + size_ptr = cgutils.alloca_once_value(self._builder, ll_size) + itemsize = self._context.get_constant( + types.intp, ceil(arg_type.dtype.bitwidth / types.byte.bitwidth) + ) + itemsize_ptr = cgutils.alloca_once_value(self._builder, itemsize) + + kernel_arg_list = [] + + kernel_dm = self._kernel_dmm.lookup(arg_type) + + kernel_arg_list.extend( + self._build_arg( + llvm_val=size_ptr, + numba_type=kernel_dm.get_member_fe_type("nitems"), + ) + ) + + # Argument itemsize + kernel_arg_list.extend( + self._build_arg( + llvm_val=itemsize_ptr, + numba_type=kernel_dm.get_member_fe_type("itemsize"), + ) + ) + + # Argument data + data_attr_ty = kernel_dm.get_member_fe_type("data") + + kernel_arg_list.extend( + self._build_local_accessor_metadata_arg( + llvm_val=llvm_val, + arg_type=arg_type, + data_attr_ty=data_attr_ty, + ) + ) + + # Arguments for shape + for val in shape: + shape_ptr = cgutils.alloca_once_value(self._builder, val) + kernel_arg_list.extend( + self._build_arg( + llvm_val=shape_ptr, + numba_type=types.int64, + ) + ) + + # Arguments for strides + for i in range(ndim): + kernel_arg_list.extend( + self._build_arg( + llvm_val=itemsize_ptr, + numba_type=types.int64, + ) + ) + + return kernel_arg_list + def _build_array_arg(self, arg_type, llvm_array_val): """Creates a list of LLVM Values for an unpacked USMNdArray kernel argument. @@ -290,22 +369,13 @@ def _build_array_arg(self, arg_type, llvm_array_val): data_attr_pos = host_data_model.get_field_position("data") data_attr_ty = kernel_data_model.get_member_fe_type("data") - if isinstance(arg_type, LocalAccessorType): - kernel_arg_list.extend( - self._build_local_accessor_metadata_arg( - llvm_val=llvm_array_val, - arg_type=arg_type, - data_attr_ty=data_attr_ty, - ) - ) - else: - kernel_arg_list.extend( - self._build_collections_attr_arg( - llvm_val=llvm_array_val, - attr_index=data_attr_pos, - attr_type=data_attr_ty, - ) + kernel_arg_list.extend( + self._build_collections_attr_arg( + llvm_val=llvm_array_val, + attr_index=data_attr_pos, + attr_type=data_attr_ty, ) + ) # Arguments for shape kernel_arg_list.extend( self._build_unituple_member_arg( diff --git a/numba_dpex/experimental/models.py b/numba_dpex/experimental/models.py index 37c861f6e8..b0c92e3083 100644 --- a/numba_dpex/experimental/models.py +++ b/numba_dpex/experimental/models.py @@ -8,10 +8,11 @@ from numba.core import types from numba.core.datamodel import DataModelManager, models -from numba.core.datamodel.models import ArrayModel, StructModel +from numba.core.datamodel.models import StructModel from numba.core.extending import register_model import numba_dpex.core.datamodel.models as dpex_core_models +from numba_dpex.core.datamodel.models import USMArrayDeviceModel from numba_dpex.core.types.kernel_api.index_space_ids import ( GroupType, ItemType, @@ -68,6 +69,17 @@ def __init__(self, dmm, fe_type): super().__init__(dmm, fe_type, members) +class LocalAccessorModel(StructModel): + """Data model for the LocalAccessor type when used in a host-only function.""" + + def __init__(self, dmm, fe_type): + ndim = fe_type.ndim + members = [ + ("shape", types.UniTuple(types.intp, ndim)), + ] + super().__init__(dmm, fe_type, members) + + def _init_exp_data_model_manager() -> DataModelManager: """Initializes a DpexExpKernelTarget-specific data model manager. @@ -84,7 +96,8 @@ def _init_exp_data_model_manager() -> DataModelManager: # Register the types and data model in the DpexExpTargetContext dmm.register(AtomicRefType, AtomicRefModel) - dmm.register(LocalAccessorType, dpex_core_models.USMArrayDeviceModel) + # Register the LocalAccessorType type + dmm.register(LocalAccessorType, USMArrayDeviceModel) # Register the GroupType type dmm.register(GroupType, EmptyStructModel) @@ -115,6 +128,5 @@ def _init_exp_data_model_manager() -> DataModelManager: # Register the MDLocalAccessorType type register_model(DpctlMDLocalAccessorType)(DpctlMDLocalAccessorModel) -# The LocalAccessorType is registered with the EmptyStructModel in the default -# data manager so that its attributes are not accessible inside dpjit. -register_model(LocalAccessorType)(ArrayModel) +# Register the LocalAccessorType type +register_model(LocalAccessorType)(LocalAccessorModel) diff --git a/numba_dpex/experimental/typeof.py b/numba_dpex/experimental/typeof.py index dcdcd1deb9..745a861ce9 100644 --- a/numba_dpex/experimental/typeof.py +++ b/numba_dpex/experimental/typeof.py @@ -97,4 +97,4 @@ def typeof_local_accessor(val: LocalAccessor, c) -> LocalAccessorType: Returns: LocalAccessorType object corresponding to the LocalAccessor object. """ # pylint: disable=protected-access - return LocalAccessorType(ndim=val._data.ndim, dtype=val._data.dtype) + return LocalAccessorType(ndim=len(val._shape), dtype=val._dtype) diff --git a/numba_dpex/kernel_api/local_accessor.py b/numba_dpex/kernel_api/local_accessor.py index 7bd3f7fb64..220ef884d7 100644 --- a/numba_dpex/kernel_api/local_accessor.py +++ b/numba_dpex/kernel_api/local_accessor.py @@ -35,19 +35,17 @@ def __init__(self, shape, dtype) -> None: if hasattr(shape, "tolist"): fn = getattr(shape, "tolist") if callable(fn): - self._shape = shape.tolist() + self._shape = tuple(shape.tolist()) else: try: - self._shape = [ - shape, - ] + self._shape = (shape,) except Exception as e: raise TypeError( "Argument shape must a non-negative integer, " "or a list/tuple of such integers." ) from e else: - self._shape = list(shape) + self._shape = tuple(shape) # Make sure shape is made up a supported types if not self._verify_positive_integral_list(self._shape): @@ -118,7 +116,9 @@ class is designed in a way to not have any data container backing up the """ def __init__(self, local_accessor: LocalAccessor): - self._data = local_accessor._data + self._data = numpy.empty( + local_accessor._shape, dtype=local_accessor._dtype + ) def __getitem__(self, idx_obj): """Returns the value stored at the position represented by idx_obj in From 8cad6148236f752fc4dcad85c90b83b534026cda Mon Sep 17 00:00:00 2001 From: Yevhenii Havrylko Date: Tue, 19 Mar 2024 12:45:42 -0400 Subject: [PATCH 11/12] Add local accessor multidimentional tests --- .../spv_overloads/test_local_accessors.py | 60 +++++++++++++++++-- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py b/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py index 1ce33c63f5..031222ac86 100644 --- a/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py +++ b/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py @@ -23,9 +23,49 @@ @dpex_exp.kernel -def _kernel(nd_item: NdItem, a, slm): +def _kernel1(nd_item: NdItem, a, slm): i = nd_item.get_global_linear_id() - j = nd_item.get_local_linear_id() + + # TODO: overload nd_item.get_local_id() + j = (nd_item.get_local_id(0),) + + slm[j] = 0 + group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) + + for m in range(100): + slm[j] += i * m + group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) + + a[i] = slm[j] + + +@dpex_exp.kernel +def _kernel2(nd_item: NdItem, a, slm): + i = nd_item.get_global_linear_id() + + # TODO: overload nd_item.get_local_id() + j = (nd_item.get_local_id(0), nd_item.get_local_id(1)) + + slm[j] = 0 + group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) + + for m in range(100): + slm[j] += i * m + group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) + + a[i] = slm[j] + + +@dpex_exp.kernel +def _kernel3(nd_item: NdItem, a, slm): + i = nd_item.get_global_linear_id() + + # TODO: overload nd_item.get_local_id() + j = ( + nd_item.get_local_id(0), + nd_item.get_local_id(1), + nd_item.get_local_id(2), + ) slm[j] = 0 group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) @@ -38,19 +78,27 @@ def _kernel(nd_item: NdItem, a, slm): @pytest.mark.parametrize("supported_dtype", list_of_supported_dtypes) -def test_local_accessor(supported_dtype): +@pytest.mark.parametrize( + "nd_range, _kernel", + [ + (dpex.NdRange((32,), (32,)), _kernel1), + (dpex.NdRange((32, 1), (32, 1)), _kernel2), + (dpex.NdRange((1, 32, 1), (1, 32, 1)), _kernel3), + ], +) +def test_local_accessor(supported_dtype, nd_range: dpex.NdRange, _kernel): """A test for passing a LocalAccessor object as a kernel argument.""" N = 32 a = dpnp.empty(N, dtype=supported_dtype) - slm = LocalAccessor((32 * 64), dtype=a.dtype) + slm = LocalAccessor(nd_range.local_range, dtype=a.dtype) # A single work group with 32 work items is launched. Each work item # computes the sum of (0..99) * its get_global_linear_id i.e., # `4950 * get_global_linear_id` and stores it into the work groups local # memory. The local memory is of size 32*64 elements of the requested dtype. # The result is then stored into `a` in global memory - dpex_exp.call_kernel(_kernel, dpex.NdRange((N,), (32,)), a, slm) + dpex_exp.call_kernel(_kernel, nd_range, a, slm) for idx in range(N): assert a[idx] == 4950 * idx @@ -68,4 +116,4 @@ def test_local_accessor_argument_to_range_kernel(): # A TypeError is raised if NUMBA_CAPTURED_ERROR=new_style and a # numba.TypingError is raised if NUMBA_CAPTURED_ERROR=old_style with pytest.raises((TypeError, TypingError)): - dpex_exp.call_kernel(_kernel, dpex.Range(N), a, slm) + dpex_exp.call_kernel(_kernel1, dpex.Range(N), a, slm) From 2200c3c4fb2f6f0439cce95901e688e2131cbc0f Mon Sep 17 00:00:00 2001 From: Yevhenii Havrylko Date: Tue, 19 Mar 2024 13:04:29 -0400 Subject: [PATCH 12/12] Add local accessor device func and python simulator tests --- .../spv_overloads/test_local_accessors.py | 41 +++++++++++-------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py b/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py index 031222ac86..d8ae378908 100644 --- a/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py +++ b/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_local_accessors.py @@ -9,12 +9,8 @@ import numba_dpex as dpex import numba_dpex.experimental as dpex_exp -from numba_dpex.kernel_api import ( - LocalAccessor, - MemoryScope, - NdItem, - group_barrier, -) +from numba_dpex.kernel_api import LocalAccessor, NdItem +from numba_dpex.kernel_api import call_kernel as kapi_call_kernel from numba_dpex.tests._helper import get_all_dtypes list_of_supported_dtypes = get_all_dtypes( @@ -22,7 +18,6 @@ ) -@dpex_exp.kernel def _kernel1(nd_item: NdItem, a, slm): i = nd_item.get_global_linear_id() @@ -30,16 +25,13 @@ def _kernel1(nd_item: NdItem, a, slm): j = (nd_item.get_local_id(0),) slm[j] = 0 - group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) for m in range(100): slm[j] += i * m - group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) a[i] = slm[j] -@dpex_exp.kernel def _kernel2(nd_item: NdItem, a, slm): i = nd_item.get_global_linear_id() @@ -47,16 +39,13 @@ def _kernel2(nd_item: NdItem, a, slm): j = (nd_item.get_local_id(0), nd_item.get_local_id(1)) slm[j] = 0 - group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) for m in range(100): slm[j] += i * m - group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) a[i] = slm[j] -@dpex_exp.kernel def _kernel3(nd_item: NdItem, a, slm): i = nd_item.get_global_linear_id() @@ -68,15 +57,23 @@ def _kernel3(nd_item: NdItem, a, slm): ) slm[j] = 0 - group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) for m in range(100): slm[j] += i * m - group_barrier(nd_item.get_group(), MemoryScope.WORK_GROUP) a[i] = slm[j] +def device_func_kernel(func): + _df = dpex_exp.device_func(func) + + @dpex_exp.kernel + def _kernel(item, a, slm): + _df(item, a, slm) + + return _kernel + + @pytest.mark.parametrize("supported_dtype", list_of_supported_dtypes) @pytest.mark.parametrize( "nd_range, _kernel", @@ -86,7 +83,17 @@ def _kernel3(nd_item: NdItem, a, slm): (dpex.NdRange((1, 32, 1), (1, 32, 1)), _kernel3), ], ) -def test_local_accessor(supported_dtype, nd_range: dpex.NdRange, _kernel): +@pytest.mark.parametrize( + "call_kernel, kernel", + [ + (dpex_exp.call_kernel, dpex_exp.kernel), + (dpex_exp.call_kernel, device_func_kernel), + (kapi_call_kernel, lambda f: f), + ], +) +def test_local_accessor( + supported_dtype, nd_range: dpex.NdRange, _kernel, call_kernel, kernel +): """A test for passing a LocalAccessor object as a kernel argument.""" N = 32 @@ -98,7 +105,7 @@ def test_local_accessor(supported_dtype, nd_range: dpex.NdRange, _kernel): # `4950 * get_global_linear_id` and stores it into the work groups local # memory. The local memory is of size 32*64 elements of the requested dtype. # The result is then stored into `a` in global memory - dpex_exp.call_kernel(_kernel, nd_range, a, slm) + call_kernel(kernel(_kernel), nd_range, a, slm) for idx in range(N): assert a[idx] == 4950 * idx