Skip to content

Commit

Permalink
Merge pull request #1178 from IntelPython/feature/KernelDispatcher
Browse files Browse the repository at this point in the history
An experimental kernel dispatcher for numba_dpex.kernel decorator
  • Loading branch information
Diptorup Deb authored Nov 3, 2023
2 parents ac05c1a + 199183c commit ad2cde1
Show file tree
Hide file tree
Showing 21 changed files with 1,525 additions and 108 deletions.
14 changes: 12 additions & 2 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,19 @@ on:
jobs:
pre-commit:
runs-on: ubuntu-20.04
defaults:
run:
shell: bash -el {0}
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v3
- uses: conda-incubator/setup-miniconda@v2
with:
python-version: '3.11'
- uses: pre-commit/[email protected]
activate-environment: "coverage"
channel-priority: "disabled"
environment-file: environment/pre-commit.yml
- uses: actions/cache@v3
with:
path: ~/.cache/pre-commit
key: pre-commit-3|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }}
- run: pre-commit run --show-diff-on-failure --color=always --all-files
14 changes: 14 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,17 @@ repos:
args: ["-i"]
exclude: "numba_dpex/dpnp_iface"
types_or: [c++, c]
- repo: local
hooks:
- id: pylint
name: pylint
entry: pylint
files: ^numba_dpex/experimental
language: system
types: [python]
require_serial: true
args:
[
"-rn", # Only display messages
"-sn", # Don't display the score
]
25 changes: 25 additions & 0 deletions environment/pre-commit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
name: dev
channels:
- dppy/label/dev
- numba
- intel
- conda-forge
- nodefaults
dependencies:
- libffi
- gxx_linux-64
- dpcpp_linux-64
- numba==0.58*
- dpctl
- dpnp
- dpcpp-llvm-spirv
- opencl_rt
- coverage
- pytest
- pytest-cov
- pytest-xdist
- pexpect
- scikit-build>=0.15*
- cmake>=3.26*
- pre-commit
- pylint
13 changes: 9 additions & 4 deletions numba_dpex/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,15 @@ def __getattr__(name):
"NUMBA_DPEX_DEBUGINFO", int, config.DEBUGINFO_DEFAULT
)

# Emit LLVM assembly language format(.ll)
DUMP_KERNEL_LLVM = _readenv(
"NUMBA_DPEX_DUMP_KERNEL_LLVM", int, config.DUMP_OPTIMIZED
)
# Emit the LLVM IR generated for a kernel decorated function
DUMP_KERNEL_LLVM = _readenv("NUMBA_DPEX_DUMP_KERNEL_LLVM", int, 0)

# Emit LLVM module generated to launch a kernel decorated function
DUMP_KERNEL_LAUNCHER = _readenv("NUMBA_DPEX_DUMP_KERNEL_LAUNCHER", int, 0)

# Enables debug printf messages inside the kernel launcher module generated for
# a kernel decorated function
DEBUG_KERNEL_LAUNCHER = _readenv("NUMBA_DPEX_DEBUG_KERNEL_LAUNCHER", int, 0)

# configs for caching
# To see the debug messages for the caching.
Expand Down
2 changes: 2 additions & 0 deletions numba_dpex/core/descriptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@ def _inherit_if_not_set(flags, options, name, default=targetconfig._NotSet):
class DpexTargetOptions(CPUTargetOptions):
experimental = _option_mapping("experimental")
release_gil = _option_mapping("release_gil")
no_compile = _option_mapping("no_compile")

def finalize(self, flags, options):
super().finalize(flags, options)
_inherit_if_not_set(flags, options, "experimental", False)
_inherit_if_not_set(flags, options, "release_gil", False)
_inherit_if_not_set(flags, options, "no_compile", True)


class DpexKernelTarget(TargetDescriptor):
Expand Down
6 changes: 6 additions & 0 deletions numba_dpex/core/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,12 @@ def __init__(self, kernel_name, *, usmarray_argnum_list) -> None:
f"usm_ndarray arguments {usmarray_args} were not allocated "
"on the same queue."
)
else:
self.message = (
f'Execution queue for kernel "{kernel_name}" could not '
"be deduced using compute follows data programming model. The "
"kernel has no USMNdArray argument."
)
super().__init__(self.message)


Expand Down
146 changes: 90 additions & 56 deletions numba_dpex/core/parfors/parfor_lowerer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
# SPDX-License-Identifier: Apache-2.0

import copy
from collections import namedtuple

from llvmlite import ir as llvmir
from numba.core import ir, types
from numba.core import cgutils, ir, types
from numba.parfors.parfor import (
find_potential_aliases_parfor,
get_parfor_outputs,
Expand All @@ -27,6 +28,12 @@
create_reduction_remainder_kernel_for_parfor,
)

_KernelArgs = namedtuple(
"_KernelArgs",
["num_flattened_args", "arg_vals", "arg_types"],
)


# A global list of kernels to keep the objects alive indefinitely.
keep_alive_kernels = []

Expand Down Expand Up @@ -84,21 +91,7 @@ class ParforLowerImpl:
for a parfor and submits it to a queue.
"""

def _get_exec_queue(self, kernel_fn, lowerer):
"""Creates a stack variable storing the sycl queue pointer used to
launch the kernel function.
"""
self.kernel_builder = KernelLaunchIRBuilder(
lowerer.context, lowerer.builder, kernel_fn.kernel.addressof_ref()
)

# Create a local variable storing a pointer to a DPCTLSyclQueueRef
# pointer.
self.curr_queue = self.kernel_builder.get_queue(
exec_queue=kernel_fn.queue
)

def _build_kernel_arglist(self, kernel_fn, lowerer):
def _build_kernel_arglist(self, kernel_fn, lowerer, kernel_builder):
"""Creates local variables for all the arguments and the argument types
that are passed to the kernel function.
Expand All @@ -110,39 +103,43 @@ def _build_kernel_arglist(self, kernel_fn, lowerer):
AssertionError: If the LLVM IR Value for an argument defined in
Numba IR is not found.
"""
self.num_flattened_args = 0
num_flattened_args = 0

# Compute number of args to be passed to the kernel. Note that the
# actual number of kernel arguments is greater than the count of
# kernel_fn.kernel_args as arrays get flattened.
for arg_type in kernel_fn.kernel_arg_types:
if isinstance(arg_type, DpnpNdArray):
datamodel = dpex_dmm.lookup(arg_type)
self.num_flattened_args += datamodel.flattened_field_count
num_flattened_args += datamodel.flattened_field_count
elif arg_type == types.complex64 or arg_type == types.complex128:
self.num_flattened_args += 2
num_flattened_args += 2
else:
self.num_flattened_args += 1
num_flattened_args += 1

# Create LLVM values for the kernel args list and kernel arg types list
self.args_list = self.kernel_builder.allocate_kernel_arg_array(
self.num_flattened_args
)
self.args_ty_list = self.kernel_builder.allocate_kernel_arg_ty_array(
self.num_flattened_args
args_list = kernel_builder.allocate_kernel_arg_array(num_flattened_args)
args_ty_list = kernel_builder.allocate_kernel_arg_ty_array(
num_flattened_args
)
callargs_ptrs = []
for arg in kernel_fn.kernel_args:
callargs_ptrs.append(_getvar(lowerer, arg))

self.kernel_builder.populate_kernel_args_and_args_ty_arrays(
kernel_builder.populate_kernel_args_and_args_ty_arrays(
kernel_argtys=kernel_fn.kernel_arg_types,
callargs_ptrs=callargs_ptrs,
args_list=self.args_list,
args_ty_list=self.args_ty_list,
args_list=args_list,
args_ty_list=args_ty_list,
datamodel_mgr=dpex_dmm,
)

return _KernelArgs(
num_flattened_args=num_flattened_args,
arg_vals=args_list,
arg_types=args_ty_list,
)

def _submit_parfor_kernel(
self,
lowerer,
Expand All @@ -156,9 +153,11 @@ def _submit_parfor_kernel(
# Ensure that the Python arguments are kept alive for the duration of
# the kernel execution
keep_alive_kernels.append(kernel_fn.kernel)
kernel_builder = KernelLaunchIRBuilder(lowerer.context, lowerer.builder)

ptr_to_queue_ref = kernel_builder.get_queue(exec_queue=kernel_fn.queue)
args = self._build_kernel_arglist(kernel_fn, lowerer, kernel_builder)

self._get_exec_queue(kernel_fn, lowerer)
self._build_kernel_arglist(kernel_fn, lowerer)
# Create a global range over which to submit the kernel based on the
# loop_ranges of the parfor
global_range = []
Expand All @@ -178,18 +177,26 @@ def _submit_parfor_kernel(

local_range = []

kernel_ref_addr = kernel_fn.kernel.addressof_ref()
kernel_ref = lowerer.builder.inttoptr(
lowerer.context.get_constant(types.uintp, kernel_ref_addr),
cgutils.voidptr_t,
)
curr_queue_ref = lowerer.builder.load(ptr_to_queue_ref)

# Submit a synchronous kernel
self.kernel_builder.submit_sync_kernel(
self.curr_queue,
self.num_flattened_args,
self.args_list,
self.args_ty_list,
global_range,
local_range,
kernel_builder.submit_sycl_kernel(
sycl_kernel_ref=kernel_ref,
sycl_queue_ref=curr_queue_ref,
total_kernel_args=args.num_flattened_args,
arg_list=args.arg_vals,
arg_ty_list=args.arg_types,
global_range=global_range,
local_range=local_range,
)

# At this point we can free the DPCTLSyclQueueRef (curr_queue)
self.kernel_builder.free_queue(sycl_queue_val=self.curr_queue)
kernel_builder.free_queue(ptr_to_sycl_queue_ref=ptr_to_queue_ref)

def _submit_reduction_main_parfor_kernel(
self,
Expand All @@ -204,9 +211,11 @@ def _submit_reduction_main_parfor_kernel(
# Ensure that the Python arguments are kept alive for the duration of
# the kernel execution
keep_alive_kernels.append(kernel_fn.kernel)
kernel_builder = KernelLaunchIRBuilder(lowerer.context, lowerer.builder)

ptr_to_queue_ref = kernel_builder.get_queue(exec_queue=kernel_fn.queue)

self._get_exec_queue(kernel_fn, lowerer)
self._build_kernel_arglist(kernel_fn, lowerer)
args = self._build_kernel_arglist(kernel_fn, lowerer, kernel_builder)
# Create a global range over which to submit the kernel based on the
# loop_ranges of the parfor
global_range = []
Expand All @@ -220,16 +229,27 @@ def _submit_reduction_main_parfor_kernel(
_load_range(lowerer, reductionHelper.work_group_size)
)

kernel_ref_addr = kernel_fn.kernel.addressof_ref()
kernel_ref = lowerer.builder.inttoptr(
lowerer.context.get_constant(types.uintp, kernel_ref_addr),
cgutils.voidptr_t,
)
curr_queue_ref = lowerer.builder.load(ptr_to_queue_ref)

# Submit a synchronous kernel
self.kernel_builder.submit_sync_kernel(
self.curr_queue,
self.num_flattened_args,
self.args_list,
self.args_ty_list,
global_range,
local_range,
kernel_builder.submit_sycl_kernel(
sycl_kernel_ref=kernel_ref,
sycl_queue_ref=curr_queue_ref,
total_kernel_args=args.num_flattened_args,
arg_list=args.arg_vals,
arg_ty_list=args.arg_types,
global_range=global_range,
local_range=local_range,
)

# At this point we can free the DPCTLSyclQueueRef referenced through
# ptr_to_queue_ref
kernel_builder.free_queue(ptr_to_sycl_queue_ref=ptr_to_queue_ref)

def _submit_reduction_remainder_parfor_kernel(
self,
lowerer,
Expand All @@ -243,8 +263,11 @@ def _submit_reduction_remainder_parfor_kernel(
# the kernel execution
keep_alive_kernels.append(kernel_fn.kernel)

self._get_exec_queue(kernel_fn, lowerer)
self._build_kernel_arglist(kernel_fn, lowerer)
kernel_builder = KernelLaunchIRBuilder(lowerer.context, lowerer.builder)

ptr_to_queue_ref = kernel_builder.get_queue(exec_queue=kernel_fn.queue)

args = self._build_kernel_arglist(kernel_fn, lowerer, kernel_builder)
# Create a global range over which to submit the kernel based on the
# loop_ranges of the parfor
global_range = []
Expand All @@ -255,16 +278,27 @@ def _submit_reduction_remainder_parfor_kernel(

local_range = []

kernel_ref_addr = kernel_fn.kernel.addressof_ref()
kernel_ref = lowerer.builder.inttoptr(
lowerer.context.get_constant(types.uintp, kernel_ref_addr),
cgutils.voidptr_t,
)
curr_queue_ref = lowerer.builder.load(ptr_to_queue_ref)

# Submit a synchronous kernel
self.kernel_builder.submit_sync_kernel(
self.curr_queue,
self.num_flattened_args,
self.args_list,
self.args_ty_list,
global_range,
local_range,
kernel_builder.submit_sycl_kernel(
sycl_kernel_ref=kernel_ref,
sycl_queue_ref=curr_queue_ref,
total_kernel_args=args.num_flattened_args,
arg_list=args.arg_vals,
arg_ty_list=args.arg_types,
global_range=global_range,
local_range=local_range,
)

# At this point we can free the DPCTLSyclQueueRef referenced through
# ptr_to_queue_ref
kernel_builder.free_queue(ptr_to_sycl_queue_ref=ptr_to_queue_ref)

def _reduction_codegen(
self,
parfor,
Expand Down
8 changes: 2 additions & 6 deletions numba_dpex/core/parfors/reduction_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,11 +395,7 @@ def work_group_size(self):

def copy_final_sum_to_host(self, parfor_kernel):
lowerer = self.lowerer
ir_builder = KernelLaunchIRBuilder(
lowerer.context,
lowerer.builder,
parfor_kernel.kernel.addressof_ref(),
)
ir_builder = KernelLaunchIRBuilder(lowerer.context, lowerer.builder)

# Create a local variable storing a pointer to a DPCTLSyclQueueRef
# pointer.
Expand Down Expand Up @@ -447,4 +443,4 @@ def copy_final_sum_to_host(self, parfor_kernel):
sycl.dpctl_event_wait(builder, event_ref)
sycl.dpctl_event_delete(builder, event_ref)

ir_builder.free_queue(sycl_queue_val=curr_queue)
ir_builder.free_queue(ptr_to_sycl_queue_ref=curr_queue)
Loading

0 comments on commit ad2cde1

Please sign in to comment.