implementation of barrier operations + test cases
adarshyoga committed Jan 12, 2024
1 parent 3bca91c commit 8774cd4
Showing 5 changed files with 224 additions and 2 deletions.
5 changes: 4 additions & 1 deletion numba_dpex/experimental/__init__.py
@@ -8,7 +8,10 @@

from numba.core.imputils import Registry

from ._kernel_dpcpp_spirv_overloads import _atomic_ref_overloads
from ._kernel_dpcpp_spirv_overloads import (
    _atomic_ref_overloads,
    _barriers_overloads,
)
from .decorators import device_func, kernel
from .kernel_dispatcher import KernelDispatcher
from .launcher import call_kernel, call_kernel_async
132 changes: 132 additions & 0 deletions numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_barriers_overloads.py
@@ -0,0 +1,132 @@
# SPDX-FileCopyrightText: 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0

"""
Provides overloads for functions included in kernel_iface.barrier that
generate dpcpp SPIR-V LLVM IR intrinsic function calls.
"""
from llvmlite import ir as llvmir
from numba.core import cgutils, types
from numba.extending import intrinsic, overload

from numba_dpex.core import itanium_mangler as ext_itanium_mangler
from numba_dpex.experimental.kernel_iface import (
    group_barrier,
    sub_group_barrier,
)
from numba_dpex.experimental.kernel_iface.memory_enums import (
    MemoryOrder,
    MemoryScope,
)
from numba_dpex.experimental.target import DPEX_KERNEL_EXP_TARGET_NAME

from ._spv_atomic_inst_helper import get_memory_semantics_mask, get_scope


def _get_memory_scope(fence_scope):
    if isinstance(fence_scope, types.Literal):
        return get_scope(fence_scope.literal_value)
    return get_scope(fence_scope.value)


@intrinsic
def _intrinsic_barrier(
    ty_context,  # pylint: disable=unused-argument
    ty_exec_scope,  # pylint: disable=unused-argument
    ty_mem_scope,  # pylint: disable=unused-argument
    ty_spirv_mem_sem_mask,  # pylint: disable=unused-argument
):
    sig = types.void(types.uint32, types.uint32, types.uint32)

    def _intrinsic_barrier_codegen(
        context, builder, sig, args
    ):  # pylint: disable=unused-argument
        fn_name = "__spirv_ControlBarrier"
        mangled_fn_name = ext_itanium_mangler.mangle_ext(
            fn_name, [types.uint32, types.uint32, types.uint32]
        )

        spirv_fn_arg_types = [
            llvmir.IntType(32),
            llvmir.IntType(32),
            llvmir.IntType(32),
        ]

        fnty = llvmir.FunctionType(llvmir.VoidType(), spirv_fn_arg_types)

        exec_scope_arg = builder.trunc(args[0], llvmir.IntType(32))
        mem_scope_arg = builder.trunc(args[1], llvmir.IntType(32))
        spirv_memory_semantics_mask_arg = builder.trunc(
            args[2], llvmir.IntType(32)
        )

        fn_args = [
            exec_scope_arg,
            mem_scope_arg,
            spirv_memory_semantics_mask_arg,
        ]

        fn = cgutils.get_or_insert_function(
            builder.module, fnty, mangled_fn_name
        )

        fn.attributes.add("convergent")
        fn.attributes.add("nounwind")
        fn.calling_convention = "spir_func"

        callinst = builder.call(fn, fn_args)

        callinst.attributes.add("convergent")
        callinst.attributes.add("nounwind")

    return (
        sig,
        _intrinsic_barrier_codegen,
    )


@overload(
    group_barrier,
    prefer_literal=True,
    target=DPEX_KERNEL_EXP_TARGET_NAME,
)
def _ol_group_barrier(fence_scope=MemoryScope.WORK_GROUP):
    spirv_memory_semantics_mask = get_memory_semantics_mask(
        MemoryOrder.SEQ_CST.value
    )
    exec_scope = get_scope(MemoryScope.WORK_GROUP.value)
    mem_scope = _get_memory_scope(fence_scope)

    def _ol_group_barrier_impl(
        fence_scope=MemoryScope.WORK_GROUP,
    ):  # pylint: disable=unused-argument
        # pylint: disable=no-value-for-parameter
        return _intrinsic_barrier(
            exec_scope, mem_scope, spirv_memory_semantics_mask
        )

    return _ol_group_barrier_impl


@overload(
    sub_group_barrier,
    prefer_literal=True,
    target=DPEX_KERNEL_EXP_TARGET_NAME,
)
def _ol_sub_group_barrier(fence_scope=MemoryScope.SUB_GROUP):
    spirv_memory_semantics_mask = get_memory_semantics_mask(
        MemoryOrder.SEQ_CST.value
    )
    exec_scope = get_scope(MemoryScope.SUB_GROUP.value)
    mem_scope = _get_memory_scope(fence_scope)

    def _ol_sub_group_barrier_impl(
        fence_scope=MemoryScope.SUB_GROUP,
    ):  # pylint: disable=unused-argument
        # pylint: disable=no-value-for-parameter
        return _intrinsic_barrier(
            exec_scope, mem_scope, spirv_memory_semantics_mask
        )

    return _ol_sub_group_barrier_impl
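
For reference, here is a minimal, self-contained llvmlite sketch of the pattern the codegen above follows: declare an external SPIR-V control-barrier function, mark it convergent/nounwind with the spir_func calling convention, and emit a call to it. This is illustrative only and not part of the commit; the mangled name and the three i32 constants are stand-ins for the values that mangle_ext, get_scope, and get_memory_semantics_mask compute at compile time.

# Illustrative sketch only. The mangled name and the i32 constants stand in
# for the values the overload computes at compile time.
from llvmlite import ir as llvmir

module = llvmir.Module(name="barrier_sketch")
i32 = llvmir.IntType(32)

# Declare: void __spirv_ControlBarrier(uint32, uint32, uint32)
fnty = llvmir.FunctionType(llvmir.VoidType(), [i32, i32, i32])
barrier_fn = llvmir.Function(module, fnty, name="_Z22__spirv_ControlBarrierjjj")
barrier_fn.attributes.add("convergent")
barrier_fn.attributes.add("nounwind")
barrier_fn.calling_convention = "spir_func"

# A caller that invokes the barrier once.
caller = llvmir.Function(
    module, llvmir.FunctionType(llvmir.VoidType(), []), name="caller"
)
builder = llvmir.IRBuilder(caller.append_basic_block("entry"))
call = builder.call(
    barrier_fn,
    [
        llvmir.Constant(i32, 2),  # execution scope (placeholder value)
        llvmir.Constant(i32, 2),  # memory scope (placeholder value)
        llvmir.Constant(i32, 0x10),  # memory-semantics mask (placeholder)
    ],
)
call.attributes.add("convergent")
call.attributes.add("nounwind")
builder.ret_void()

print(module)  # dump the generated textual LLVM IR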
10 changes: 9 additions & 1 deletion numba_dpex/experimental/kernel_iface/__init__.py
@@ -9,6 +9,14 @@
"""

from .atomic_ref import AtomicRef
from .barrier import group_barrier, sub_group_barrier
from .memory_enums import AddressSpace, MemoryOrder, MemoryScope

__all__ = ["AddressSpace", "AtomicRef", "MemoryOrder", "MemoryScope"]
__all__ = [
    "group_barrier",
    "sub_group_barrier",
    "AddressSpace",
    "AtomicRef",
    "MemoryOrder",
    "MemoryScope",
]
51 changes: 51 additions & 0 deletions numba_dpex/experimental/kernel_iface/barrier.py
@@ -0,0 +1,51 @@
# SPDX-FileCopyrightText: 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0

"""Python functions that simulate SYCL's barrier primitives.
"""

from .memory_enums import MemoryScope


def group_barrier(fence_scope=MemoryScope.WORK_GROUP):
    """Performs a barrier operation across all work-items in a work group.

    The function is modeled after the ``sycl::group_barrier`` function. It
    synchronizes work within a group of work-items. All the work-items of the
    group must execute the barrier construct before any work-item continues
    execution beyond the barrier. However, unlike ``sycl::group_barrier``, the
    numba_dpex function implicitly synchronizes at the level of a work group
    and does not allow specifying the group as an argument. The
    :func:`sub_group_barrier` function should be used if synchronization has
    to be performed only across a sub-group.

    ``group_barrier`` performs mem-fence operations ensuring that memory
    accesses issued before the barrier are not re-ordered with those issued
    after the barrier: all work-items in group g execute a release fence prior
    to synchronizing at the barrier, all work-items in group g execute an
    acquire fence afterwards, and there is an implicit synchronization of
    these fences as if provided by an explicit atomic operation on an atomic
    object.

    Args:
        fence_scope (optional): scope of any memory consistency operations
            that are performed by the barrier.
    """

    # TODO: A pure Python simulation of a group_barrier will be added later.
    raise NotImplementedError


def sub_group_barrier(fence_scope=MemoryScope.SUB_GROUP):
    """Performs a barrier operation across all work-items in a sub-group.

    Modeled after the ``sycl::group_barrier`` function when invoked on a
    sub-group. Refer to :func:`group_barrier` for further details.

    Args:
        fence_scope (optional): scope of any memory consistency operations
            that are performed by the barrier.
    """

    # TODO: A pure Python simulation of a sub_group_barrier will be added
    # later.
    raise NotImplementedError
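
For orientation, a short, hypothetical usage sketch of the two entry points from inside an experimental kernel (illustrative only, not part of this commit; both barriers are called with their default fence scopes, and the kernel body and array size are arbitrary). It anticipates the test added further down.

# Hypothetical usage sketch; mirrors the test added further down.
import dpnp

import numba_dpex as dpex
import numba_dpex.experimental as dpex_exp
from numba_dpex.experimental.kernel_iface import (
    group_barrier,
    sub_group_barrier,
)


@dpex_exp.kernel
def _increment_twice(a):
    i = dpex.get_global_id(0)

    a[i] += 1
    # All work-items of the work group reach this point before any continue,
    # with memory consistency at the default work-group scope.
    group_barrier()

    a[i] += 1
    # Same idea, but synchronization is scoped to the work-item's sub-group.
    sub_group_barrier()


a = dpnp.zeros(64, dtype=dpnp.int32)
dpex_exp.call_kernel(_increment_twice, dpex.Range(64), a)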
@@ -0,0 +1,28 @@
import dpnp

import numba_dpex as dpex
import numba_dpex.experimental as dpex_exp
from numba_dpex.experimental.kernel_iface import group_barrier


def test_group_barrier():
    """A test for group_barrier function."""

    @dpex_exp.kernel
    def _kernel(a, N):
        i = dpex.get_global_id(0)

        a[i] += 1
        group_barrier()

        if i == 0:
            for idx in range(1, N):
                a[0] += a[idx]

    N = 8196
    a = dpnp.ones(N, dtype=dpnp.int32)
    b = dpnp.ones(N, dtype=dpnp.int32)

    dpex_exp.call_kernel(_kernel, dpex.Range(N), a, N)

    assert a[0] == N * 2
