implementation of barrier operation and test case

IntelPython · Jan 30, 2024 · 3b8a88c · 3b8a88c
1 parent 8fdfc1b
commit 3b8a88c
Show file tree

Hide file tree

Showing 5 changed files with 207 additions and 2 deletions.
diff --git a/numba_dpex/experimental/__init__.py b/numba_dpex/experimental/__init__.py
@@ -8,7 +8,10 @@
 
 from numba.core.imputils import Registry
 
-from ._kernel_dpcpp_spirv_overloads import _atomic_ref_overloads
+from ._kernel_dpcpp_spirv_overloads import (
+    _atomic_ref_overloads,
+    _barriers_overloads,
+)
 from .decorators import device_func, kernel
 from .kernel_dispatcher import KernelDispatcher
 from .launcher import call_kernel, call_kernel_async

diff --git a/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_barriers_overloads.py b/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_barriers_overloads.py
@@ -0,0 +1,131 @@
+# SPDX-FileCopyrightText: 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Provides overloads for functions included in kernel_iface.barrier that
+generate dpcpp SPIR-V LLVM IR intrinsic function calls.
+"""
+from llvmlite import ir as llvmir
+from numba.core import cgutils, types
+from numba.extending import intrinsic, overload
+
+from numba_dpex.core import itanium_mangler as ext_itanium_mangler
+from numba_dpex.experimental.kernel_iface import group_barrier
+from numba_dpex.experimental.kernel_iface.memory_enums import (
+    MemoryOrder,
+    MemoryScope,
+)
+from numba_dpex.experimental.target import DPEX_KERNEL_EXP_TARGET_NAME
+
+from ._spv_atomic_inst_helper import get_memory_semantics_mask, get_scope
+
+
+def _get_memory_scope(fence_scope):
+    if isinstance(fence_scope, types.Literal):
+        return get_scope(fence_scope.literal_value)
+    return get_scope(fence_scope.value)
+
+
+@intrinsic
+def _intrinsic_barrier(
+    ty_context,  # pylint: disable=unused-argument
+    ty_exec_scope,  # pylint: disable=unused-argument
+    ty_mem_scope,  # pylint: disable=unused-argument
+    ty_spirv_mem_sem_mask,  # pylint: disable=unused-argument
+):
+    # Signature of `__spirv_control_barrier` call that is
+    # generated for group_barrier. It takes three arguments -
+    # exec_scope, memory_scope and memory_semantics_mask.
+    # All arguments have to be of type unsigned int32.
+    sig = types.void(types.uint32, types.uint32, types.uint32)
+
+    def _intrinsic_barrier_codegen(
+        context, builder, sig, args  # pylint: disable=unused-argument
+    ):
+        exec_scope_arg = builder.trunc(args[0], llvmir.IntType(32))
+        mem_scope_arg = builder.trunc(args[1], llvmir.IntType(32))
+        spirv_memory_semantics_mask_arg = builder.trunc(
+            args[2], llvmir.IntType(32)
+        )
+
+        fn_args = [
+            exec_scope_arg,
+            mem_scope_arg,
+            spirv_memory_semantics_mask_arg,
+        ]
+
+        mangled_fn_name = ext_itanium_mangler.mangle_ext(
+            "__spirv_ControlBarrier", [types.uint32, types.uint32, types.uint32]
+        )
+
+        spirv_fn_arg_types = [
+            llvmir.IntType(32),
+            llvmir.IntType(32),
+            llvmir.IntType(32),
+        ]
+
+        fn = cgutils.get_or_insert_function(
+            builder.module,
+            llvmir.FunctionType(llvmir.VoidType(), spirv_fn_arg_types),
+            mangled_fn_name,
+        )
+
+        fn.attributes.add("convergent")
+        fn.attributes.add("nounwind")
+        fn.calling_convention = "spir_func"
+
+        callinst = builder.call(fn, fn_args)
+
+        callinst.attributes.add("convergent")
+        callinst.attributes.add("nounwind")
+
+    return (
+        sig,
+        _intrinsic_barrier_codegen,
+    )
+
+
+@overload(
+    group_barrier,
+    prefer_literal=True,
+    target=DPEX_KERNEL_EXP_TARGET_NAME,
+)
+def ol_group_barrier(fence_scope=MemoryScope.WORK_GROUP):
+    """SPIR-V overload for
+    :meth:`numba_dpex.experimental.kernel_iface.group_barrier`.
+
+    Generates the same LLVM IR instruction as dpcpp for the
+    `group_barrier` function.
+    """
+
+    # Per SYCL spec, group_barrier must perform both control
+    # barrier and memory fence operations. Hence,
+    # group_barrier requires two scopes and memory
+    # consistency specification as three arguments.
+    #
+    # mem_scope - scope of any memory consistency operations
+    #             that are performed by the barrier. By default,
+    #             mem_scope is set to `work_group`.
+    # exec_scope - scope that determines the set of work-items
+    #              that synchronize at barrier.
+    #              Set to `work_group` for group_barrier always.
+    # spirv_memory_semantics_mask - Based on sycl implementation,
+    # Mask that is set to use sequential consistency
+    # memory order semantics always.
+
+    mem_scope = _get_memory_scope(fence_scope)
+    exec_scope = get_scope(MemoryScope.WORK_GROUP.value)
+    spirv_memory_semantics_mask = get_memory_semantics_mask(
+        MemoryOrder.SEQ_CST.value
+    )
+
+    def _ol_group_barrier_impl(
+        fence_scope=MemoryScope.WORK_GROUP,
+    ):  # pylint: disable=unused-argument
+        # pylint: disable=no-value-for-parameter
+        return _intrinsic_barrier(
+            exec_scope, mem_scope, spirv_memory_semantics_mask
+        )
+
+    return _ol_group_barrier_impl
diff --git a/numba_dpex/experimental/kernel_iface/__init__.py b/numba_dpex/experimental/kernel_iface/__init__.py
@@ -9,6 +9,13 @@
 """
 
 from .atomic_ref import AtomicRef
+from .barrier import group_barrier
 from .memory_enums import AddressSpace, MemoryOrder, MemoryScope
 
-__all__ = ["AddressSpace", "AtomicRef", "MemoryOrder", "MemoryScope"]
+__all__ = [
+    "group_barrier",
+    "AddressSpace",
+    "AtomicRef",
+    "MemoryOrder",
+    "MemoryScope",
+]
diff --git a/numba_dpex/experimental/kernel_iface/barrier.py b/numba_dpex/experimental/kernel_iface/barrier.py
@@ -0,0 +1,36 @@
+# SPDX-FileCopyrightText: 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Python functions that simulate SYCL's barrier primitives.
+"""
+
+from .memory_enums import MemoryScope
+
+
+def group_barrier(fence_scope=MemoryScope.WORK_GROUP):
+    """Performs a barrier operation across all work-items in a work group.
+
+    The function is modeled after the ``sycl::group_barrier`` function. It
+    synchronizes work within a group of work items. All the work-items
+    of the group must execute the barrier construct before any work-item
+    continues execution beyond the barrier. However, unlike
+    ``sycl::group_barrier`` the numba_dpex function implicitly synchronizes at
+    the level of a work group and does not allow specifying the group as an
+    argument. The :func:`sub_group_barrier` function should be used if
+    synchronization has to be performed only across a sub-group.
+
+    The ``group_barrier`` performs mem-fence operations ensuring that memory
+    accesses issued before the barrier are not re-ordered with those issued
+    after the barrier: all work-items in group g execute a release fence prior
+    to synchronizing at the barrier, all work-items in group g execute an
+    acquire fence afterwards, and there is an implicit synchronization of these
+    fences as if provided by an explicit atomic operation on an atomic object.
+
+    Args:
+        fence_scope (optional): scope of any memory consistency
+        operations that are performed by the barrier.
+    """
+
+    # TODO: A pure Python simulation of a group_barrier will be added later.
+    raise NotImplementedError
diff --git a/numba_dpex/tests/experimental/kernel_iface/spv_overloads/test_barriers.py b/numba_dpex/tests/experimental/kernel_iface/spv_overloads/test_barriers.py
@@ -0,0 +1,28 @@
+import dpnp
+
+import numba_dpex as dpex
+import numba_dpex.experimental as dpex_exp
+from numba_dpex.experimental.kernel_iface import group_barrier
+
+
+def test_group_barrier():
+    """A test for group_barrier function."""
+
+    @dpex_exp.kernel
+    def _kernel(a, N):
+        i = dpex.get_global_id(0)
+
+        a[i] += 1
+        group_barrier()
+
+        if i == 0:
+            for idx in range(1, N):
+                a[0] += a[idx]
+
+    N = 8196
+    a = dpnp.ones(N, dtype=dpnp.int32)
+    b = dpnp.ones(N, dtype=dpnp.int32)
+
+    dpex_exp.call_kernel(_kernel, dpex.Range(N), a, N)
+
+    assert a[0] == N * 2