Implementation of group barrier operation #1280

Merged (1 commit) on Feb 3, 2024
1 change: 1 addition & 0 deletions numba_dpex/experimental/__init__.py
@@ -13,6 +13,7 @@

from ._kernel_dpcpp_spirv_overloads import (
    _atomic_ref_overloads,
    _group_barrier_overloads,
    _index_space_id_overloads,
)
from .decorators import device_func, kernel
143 changes: 143 additions & 0 deletions numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_group_barrier_overloads.py
@@ -0,0 +1,143 @@
# SPDX-FileCopyrightText: 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0

"""
Provides overloads for functions included in kernel_iface.barrier that
generate dpcpp SPIR-V LLVM IR intrinsic function calls.
"""
import warnings

from llvmlite import ir as llvmir
from numba.core import cgutils, types
from numba.extending import intrinsic, overload

from numba_dpex.core import itanium_mangler as ext_itanium_mangler
from numba_dpex.experimental.target import DPEX_KERNEL_EXP_TARGET_NAME
from numba_dpex.kernel_api import group_barrier
from numba_dpex.kernel_api.memory_enums import MemoryOrder, MemoryScope

from ._spv_atomic_inst_helper import get_memory_semantics_mask, get_scope

_SUPPORT_CONVERGENT = True

try:
    llvmir.FunctionAttributes("convergent")
except ValueError:
    warnings.warn(
        "The convergent attribute is supported only in llvmlite 0.42 and "
        "newer. Not setting this attribute may result in unexpected "
        "behavior when using group_barrier."
    )
    _SUPPORT_CONVERGENT = False


def _get_memory_scope(fence_scope):
    if isinstance(fence_scope, types.Literal):
        return get_scope(fence_scope.literal_value)
    return get_scope(fence_scope.value)


@intrinsic
def _intrinsic_barrier(
    ty_context,  # pylint: disable=unused-argument
    ty_exec_scope,  # pylint: disable=unused-argument
    ty_mem_scope,  # pylint: disable=unused-argument
    ty_spirv_mem_sem_mask,  # pylint: disable=unused-argument
):
    # Signature of the `__spirv_ControlBarrier` call that is generated for
    # group_barrier. It takes three arguments: exec_scope, memory_scope, and
    # memory_semantics_mask. All arguments have to be of type unsigned int32.
    sig = types.void(types.uint32, types.uint32, types.uint32)

    def _intrinsic_barrier_codegen(
        context, builder, sig, args  # pylint: disable=unused-argument
    ):
        exec_scope_arg = builder.trunc(args[0], llvmir.IntType(32))
        mem_scope_arg = builder.trunc(args[1], llvmir.IntType(32))
        spirv_memory_semantics_mask_arg = builder.trunc(
            args[2], llvmir.IntType(32)
        )

        fn_args = [
            exec_scope_arg,
            mem_scope_arg,
            spirv_memory_semantics_mask_arg,
        ]

        mangled_fn_name = ext_itanium_mangler.mangle_ext(
            "__spirv_ControlBarrier", [types.uint32, types.uint32, types.uint32]
        )

        spirv_fn_arg_types = [
            llvmir.IntType(32),
            llvmir.IntType(32),
            llvmir.IntType(32),
        ]

        # TODO: split the function declaration from the call
        fn = cgutils.get_or_insert_function(
            builder.module,
            llvmir.FunctionType(llvmir.VoidType(), spirv_fn_arg_types),
            mangled_fn_name,
        )

        if _SUPPORT_CONVERGENT:
            fn.attributes.add("convergent")
        fn.attributes.add("nounwind")
        fn.calling_convention = "spir_func"

        callinst = builder.call(fn, fn_args)

        if _SUPPORT_CONVERGENT:
            callinst.attributes.add("convergent")
        callinst.attributes.add("nounwind")

    return (
        sig,
        _intrinsic_barrier_codegen,
    )


@overload(
    group_barrier,
    prefer_literal=True,
    target=DPEX_KERNEL_EXP_TARGET_NAME,
)
def ol_group_barrier(fence_scope=MemoryScope.WORK_GROUP):
    """SPIR-V overload for
    :meth:`numba_dpex.kernel_api.group_barrier`.

    Generates the same LLVM IR instruction as dpcpp does for the
    `group_barrier` function.

    Per the SYCL spec, group_barrier must perform both a control barrier and
    a memory fence operation. The generated `__spirv_ControlBarrier` call
    therefore takes two scopes and a memory-consistency mask as its three
    arguments:

    exec_scope - scope that determines the set of work-items that synchronize
        at the barrier. Always set to `work_group` for group_barrier.
    mem_scope - scope of any memory consistency operations that are performed
        by the barrier. Defaults to `work_group`.
    spirv_memory_semantics_mask - memory semantics mask, following the SYCL
        implementation. Always set to sequentially-consistent memory order.
    """

    mem_scope = _get_memory_scope(fence_scope)
    exec_scope = get_scope(MemoryScope.WORK_GROUP.value)
    spirv_memory_semantics_mask = get_memory_semantics_mask(
        MemoryOrder.SEQ_CST.value
    )

    def _ol_group_barrier_impl(
        fence_scope=MemoryScope.WORK_GROUP,
    ):  # pylint: disable=unused-argument
        # pylint: disable=no-value-for-parameter
        return _intrinsic_barrier(
            exec_scope, mem_scope, spirv_memory_semantics_mask
        )

    return _ol_group_barrier_impl
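
The `@intrinsic` plus `@overload` pairing above is standard Numba extension machinery: the overload resolves the scope and memory-semantics constants at typing time, and the intrinsic lowers the call to the mangled `__spirv_ControlBarrier` symbol. Below is a minimal, self-contained sketch of the same pattern; it is not part of this PR, it targets the default CPU pipeline so it runs without numba_dpex or a SYCL device, and the `clamp_to_u8` name and its helpers are hypothetical, invented purely for illustration.

from llvmlite import ir as llvmir
from numba import njit
from numba.core import types
from numba.extending import intrinsic, overload


def clamp_to_u8(value):
    # Pure-Python stub, analogous to kernel_api.group_barrier above.
    raise NotImplementedError


@intrinsic
def _intrinsic_clamp_to_u8(ty_context, ty_value):
    # Typed signature of the generated operation: uint8 result, int64 input.
    sig = types.uint8(types.int64)

    def _codegen(context, builder, sig, args):
        # Emit a single LLVM trunc instruction, the same way
        # _intrinsic_barrier_codegen narrows its arguments with builder.trunc.
        return builder.trunc(args[0], llvmir.IntType(8))

    return sig, _codegen


@overload(clamp_to_u8)
def ol_clamp_to_u8(value):
    def _impl(value):
        return _intrinsic_clamp_to_u8(value)

    return _impl


@njit
def _demo(x):
    return clamp_to_u8(x)


print(_demo(300))  # prints 44: 300 truncated to the low 8 bits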
2 changes: 2 additions & 0 deletions numba_dpex/kernel_api/__init__.py
@@ -10,6 +10,7 @@
"""

from .atomic_ref import AtomicRef
from .barrier import group_barrier
from .index_space_ids import Item, NdItem
from .memory_enums import AddressSpace, MemoryOrder, MemoryScope
from .ranges import NdRange, Range
@@ -23,4 +24,5 @@
"Range",
"NdItem",
"Item",
"group_barrier",
]
36 changes: 36 additions & 0 deletions numba_dpex/kernel_api/barrier.py
@@ -0,0 +1,36 @@
# SPDX-FileCopyrightText: 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0

"""Python functions that simulate SYCL's barrier primitives.
"""

from .memory_enums import MemoryScope


def group_barrier(fence_scope=MemoryScope.WORK_GROUP):
    """Performs a barrier operation across all work-items in a work-group.

    The function is modeled after the ``sycl::group_barrier`` function. It
    synchronizes work within a group of work-items. All work-items of the
    group must execute the barrier construct before any work-item continues
    execution beyond the barrier. However, unlike ``sycl::group_barrier``, the
    numba_dpex function implicitly synchronizes at the level of a work-group
    and does not allow specifying the group as an argument. The
    :func:`sub_group_barrier` function should be used if synchronization has
    to be performed only across a sub-group.

    The ``group_barrier`` performs mem-fence operations ensuring that memory
    accesses issued before the barrier are not re-ordered with those issued
    after the barrier: all work-items in group g execute a release fence prior
    to synchronizing at the barrier, all work-items in group g execute an
    acquire fence afterwards, and there is an implicit synchronization of
    these fences as if provided by an explicit atomic operation on an atomic
    object.

    Args:
        fence_scope (optional): scope of any memory consistency
            operations that are performed by the barrier.
    """

    # TODO: A pure Python simulation of a group_barrier will be added later.
    raise NotImplementedError
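
The TODO above leaves the pure-Python simulation of group_barrier unimplemented. A minimal sketch of one possible shape for it follows; it is not part of this PR, it assumes the work-items of a single work-group are simulated by one thread each, and every name in it is hypothetical.

import threading

from numba_dpex.kernel_api.memory_enums import MemoryScope


class SimulatedWorkGroup:
    # Hypothetical helper: one threading.Barrier shared by the threads that
    # simulate the work-items of a single work-group.

    def __init__(self, num_work_items):
        self._barrier = threading.Barrier(num_work_items)

    def group_barrier(self, fence_scope=MemoryScope.WORK_GROUP):
        # Barrier.wait() blocks until all simulated work-items arrive,
        # mirroring the control-barrier part of group_barrier; the memory
        # fence is effectively a no-op under CPython's GIL.
        self._barrier.wait()

A driver that spawns num_work_items threads and has each of them call group_barrier() would then reproduce the blocking behavior described in the docstring above.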
@@ -0,0 +1,30 @@
import dpnp

import numba_dpex as dpex
import numba_dpex.experimental as dpex_exp
from numba_dpex.kernel_api import MemoryScope, NdItem, group_barrier
from numba_dpex.tests._helper import skip_windows


# TODO: https://github.com/IntelPython/numba-dpex/issues/1308
@skip_windows
def test_group_barrier():
    """A test for the group_barrier function."""

    @dpex_exp.kernel
    def _kernel(nd_item: NdItem, a):
        i = nd_item.get_global_id(0)

        a[i] += 1
        group_barrier(MemoryScope.DEVICE)

        if i == 0:
            for idx in range(1, a.size):
                a[0] += a[idx]

    N = 16
    a = dpnp.ones(N, dtype=dpnp.int32)

    dpex_exp.call_kernel(_kernel, dpex.NdRange((N,), (N,)), a)

    assert a[0] == N * 2