implementation of barrier operation and test case

Co-authored-by: Yevhenii Havrylko <[email protected]>
IntelPython · Feb 2, 2024 · 8ca9551 · 8ca9551
1 parent b0ea6aa
commit 8ca9551
Show file tree

Hide file tree

Showing 5 changed files with 203 additions and 0 deletions.
diff --git a/numba_dpex/experimental/__init__.py b/numba_dpex/experimental/__init__.py
@@ -13,6 +13,7 @@
 
 from ._kernel_dpcpp_spirv_overloads import (
     _atomic_ref_overloads,
+    _barriers_overloads,
     _index_space_id_overloads,
 )
 from .decorators import device_func, kernel

diff --git a/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_barriers_overloads.py b/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_barriers_overloads.py
@@ -0,0 +1,137 @@
+# SPDX-FileCopyrightText: 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Provides overloads for functions included in kernel_iface.barrier that
+generate dpcpp SPIR-V LLVM IR intrinsic function calls.
+"""
+from llvmlite import ir as llvmir
+from numba.core import cgutils, types
+from numba.extending import intrinsic, overload
+
+from numba_dpex.core import itanium_mangler as ext_itanium_mangler
+from numba_dpex.experimental.target import DPEX_KERNEL_EXP_TARGET_NAME
+from numba_dpex.kernel_api import group_barrier
+from numba_dpex.kernel_api.memory_enums import MemoryOrder, MemoryScope
+
+from ._spv_atomic_inst_helper import get_memory_semantics_mask, get_scope
+
+_SUPPORT_CONVERGENT = True
+
+try:
+    llvmir.FunctionAttributes("convergent")
+except ValueError:
+    _SUPPORT_CONVERGENT = False
+
+
+def _get_memory_scope(fence_scope):
+    if isinstance(fence_scope, types.Literal):
+        return get_scope(fence_scope.literal_value)
+    return get_scope(fence_scope.value)
+
+
+@intrinsic
+def _intrinsic_barrier(
+    ty_context,  # pylint: disable=unused-argument
+    ty_exec_scope,  # pylint: disable=unused-argument
+    ty_mem_scope,  # pylint: disable=unused-argument
+    ty_spirv_mem_sem_mask,  # pylint: disable=unused-argument
+):
+    # Signature of `__spirv_control_barrier` call that is
+    # generated for group_barrier. It takes three arguments -
+    # exec_scope, memory_scope and memory_semantics_mask.
+    # All arguments have to be of type unsigned int32.
+    sig = types.void(types.uint32, types.uint32, types.uint32)
+
+    def _intrinsic_barrier_codegen(
+        context, builder, sig, args  # pylint: disable=unused-argument
+    ):
+        exec_scope_arg = builder.trunc(args[0], llvmir.IntType(32))
+        mem_scope_arg = builder.trunc(args[1], llvmir.IntType(32))
+        spirv_memory_semantics_mask_arg = builder.trunc(
+            args[2], llvmir.IntType(32)
+        )
+
+        fn_args = [
+            exec_scope_arg,
+            mem_scope_arg,
+            spirv_memory_semantics_mask_arg,
+        ]
+
+        mangled_fn_name = ext_itanium_mangler.mangle_ext(
+            "__spirv_ControlBarrier", [types.uint32, types.uint32, types.uint32]
+        )
+
+        spirv_fn_arg_types = [
+            llvmir.IntType(32),
+            llvmir.IntType(32),
+            llvmir.IntType(32),
+        ]
+
+        fn = cgutils.get_or_insert_function(
+            builder.module,
+            llvmir.FunctionType(llvmir.VoidType(), spirv_fn_arg_types),
+            mangled_fn_name,
+        )
+
+        if _SUPPORT_CONVERGENT:
+            fn.attributes.add("convergent")
+        fn.attributes.add("nounwind")
+        fn.calling_convention = "spir_func"
+
+        callinst = builder.call(fn, fn_args)
+
+        if _SUPPORT_CONVERGENT:
+            callinst.attributes.add("convergent")
+        callinst.attributes.add("nounwind")
+
+    return (
+        sig,
+        _intrinsic_barrier_codegen,
+    )
+
+
+@overload(
+    group_barrier,
+    prefer_literal=True,
+    target=DPEX_KERNEL_EXP_TARGET_NAME,
+)
+def ol_group_barrier(fence_scope=MemoryScope.WORK_GROUP):
+    """SPIR-V overload for
+    :meth:`numba_dpex.kernel_api.group_barrier`.
+
+    Generates the same LLVM IR instruction as dpcpp for the
+    `group_barrier` function.
+    """
+
+    # Per SYCL spec, group_barrier must perform both control
+    # barrier and memory fence operations. Hence,
+    # group_barrier requires two scopes and memory
+    # consistency specification as three arguments.
+    #
+    # mem_scope - scope of any memory consistency operations
+    #             that are performed by the barrier. By default,
+    #             mem_scope is set to `work_group`.
+    # exec_scope - scope that determines the set of work-items
+    #              that synchronize at barrier.
+    #              Set to `work_group` for group_barrier always.
+    # spirv_memory_semantics_mask - Based on sycl implementation,
+    # Mask that is set to use sequential consistency
+    # memory order semantics always.
+
+    mem_scope = _get_memory_scope(fence_scope)
+    exec_scope = get_scope(MemoryScope.WORK_GROUP.value)
+    spirv_memory_semantics_mask = get_memory_semantics_mask(
+        MemoryOrder.SEQ_CST.value
+    )
+
+    def _ol_group_barrier_impl(
+        fence_scope=MemoryScope.WORK_GROUP,
+    ):  # pylint: disable=unused-argument
+        # pylint: disable=no-value-for-parameter
+        return _intrinsic_barrier(
+            exec_scope, mem_scope, spirv_memory_semantics_mask
+        )
+
+    return _ol_group_barrier_impl
diff --git a/numba_dpex/kernel_api/__init__.py b/numba_dpex/kernel_api/__init__.py
@@ -10,6 +10,7 @@
 """
 
 from .atomic_ref import AtomicRef
+from .barrier import group_barrier
 from .index_space_ids import Item, NdItem
 from .memory_enums import AddressSpace, MemoryOrder, MemoryScope
 from .ranges import NdRange, Range
@@ -23,4 +24,5 @@
     "Range",
     "NdItem",
     "Item",
+    "group_barrier",
 ]
diff --git a/numba_dpex/kernel_api/barrier.py b/numba_dpex/kernel_api/barrier.py
@@ -0,0 +1,36 @@
+# SPDX-FileCopyrightText: 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Python functions that simulate SYCL's barrier primitives.
+"""
+
+from .memory_enums import MemoryScope
+
+
+def group_barrier(fence_scope=MemoryScope.WORK_GROUP):
+    """Performs a barrier operation across all work-items in a work group.
+
+    The function is modeled after the ``sycl::group_barrier`` function. It
+    synchronizes work within a group of work items. All the work-items
+    of the group must execute the barrier construct before any work-item
+    continues execution beyond the barrier. However, unlike
+    ``sycl::group_barrier`` the numba_dpex function implicitly synchronizes at
+    the level of a work group and does not allow specifying the group as an
+    argument. The :func:`sub_group_barrier` function should be used if
+    synchronization has to be performed only across a sub-group.
+
+    The ``group_barrier`` performs mem-fence operations ensuring that memory
+    accesses issued before the barrier are not re-ordered with those issued
+    after the barrier: all work-items in group g execute a release fence prior
+    to synchronizing at the barrier, all work-items in group g execute an
+    acquire fence afterwards, and there is an implicit synchronization of these
+    fences as if provided by an explicit atomic operation on an atomic object.
+
+    Args:
+        fence_scope (optional): scope of any memory consistency
+        operations that are performed by the barrier.
+    """
+
+    # TODO: A pure Python simulation of a group_barrier will be added later.
+    raise NotImplementedError
diff --git a/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_barriers.py b/numba_dpex/tests/experimental/kernel_api_overloads/spv_overloads/test_barriers.py
@@ -0,0 +1,27 @@
+import dpnp
+
+import numba_dpex as dpex
+import numba_dpex.experimental as dpex_exp
+from numba_dpex.kernel_api import MemoryScope, NdItem, group_barrier
+
+
+def test_group_barrier():
+    """A test for group_barrier function."""
+
+    @dpex_exp.kernel
+    def _kernel(nd_item: NdItem, a):
+        i = nd_item.get_global_id(0)
+
+        a[i] += 1
+        group_barrier(MemoryScope.DEVICE)
+
+        if i == 0:
+            for idx in range(1, a.size):
+                a[0] += a[idx]
+
+    N = 16
+    a = dpnp.ones(N, dtype=dpnp.int32)
+
+    dpex_exp.call_kernel(_kernel, dpex.NdRange((N,), (N,)), a)
+
+    assert a[0] == N * 2