From 1afb822b5a63277c4a7a9c6ca51628be0edc1c14 Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Wed, 19 Jun 2024 08:54:13 +0000 Subject: [PATCH] final clean-up, add documentation, etc --- loki/backend/cgen.py | 31 +- loki/backend/cppgen.py | 10 +- loki/backend/cudagen.py | 18 +- .../block_index_transformations.py | 87 +++-- loki/transformations/data_offload.py | 4 +- loki/transformations/single_column/base.py | 32 +- loki/transformations/single_column/scc_cuf.py | 108 +++++- .../single_column/scc_low_level.py | 310 +++++++++++++++++- .../tests/test_block_index_inject.py | 21 +- loki/transformations/transpile/fortran_c.py | 34 +- 10 files changed, 538 insertions(+), 117 deletions(-) diff --git a/loki/backend/cgen.py b/loki/backend/cgen.py index 64680871d..6234dd7a6 100644 --- a/loki/backend/cgen.py +++ b/loki/backend/cgen.py @@ -22,6 +22,9 @@ class IntrinsicTypeC: + """ + Mapping Fortran type to corresponding C type. + """ # pylint: disable=abstract-method, unused-argument def __init__(self, *args, **kwargs): @@ -45,6 +48,10 @@ def c_intrinsic_type(self, _type, *args, **kwargs): c_intrinsic_type = IntrinsicTypeC() class CCodeMapper(LokiStringifyMapper): + """ + A :class:`StringifyMapper`-derived visitor for Pymbolic expression trees that converts an + expression to a string adhering to standardized C. + """ # pylint: disable=abstract-method, unused-argument def __init__(self, c_intrinsic_type, *args, **kwargs): @@ -103,7 +110,6 @@ def map_array_subscript(self, expr, enclosing_prec, *args, **kwargs): if d: index_str += self.format('[%s]', d) return self.format('%s%s', name_str, index_str) - # else: return self.format('%s', name_str) map_string_subscript = map_array_subscript @@ -170,6 +176,9 @@ def visit_Module(self, o, **kwargs): return self.join_lines(spec, routines) def _subroutine_header(self, o, **kwargs): + """ + Helper function/header for :func:`~loki.backend.CCodegen.visit_Subroutine`. + """ # Some boilerplate imports... header = [self.format_line('#include <', name, '>') for name in self.standard_imports] # ...and imports from the spec @@ -178,6 +187,9 @@ def _subroutine_header(self, o, **kwargs): return header def _subroutine_arguments(self, o, **kwargs): + """ + Helper function/routine arguments for :func:`~loki.backend.CCodegen.visit_Subroutine`. + """ var_keywords = [] pass_by = [] for a in o.arguments: @@ -193,6 +205,9 @@ def _subroutine_arguments(self, o, **kwargs): return pass_by, var_keywords def _subroutine_declaration(self, o, **kwargs): + """ + Helper function/function declaration part for :func:`~loki.backend.CCodegen.visit_Subroutine`. + """ pass_by, var_keywords = self._subroutine_arguments(o, **kwargs) arguments = [f'{k}{self.visit(a.type, **kwargs)} {p}{a.name}' for a, p, k in zip(o.arguments, pass_by, var_keywords)] @@ -207,6 +222,9 @@ def _subroutine_declaration(self, o, **kwargs): return declaration def _subroutine_body(self, o, **kwargs): + """ + Helper function/body for :func:`~loki.backend.CCodegen.visit_Subroutine`. + """ self.depth += 1 # body = ['{'] @@ -226,6 +244,9 @@ def _subroutine_body(self, o, **kwargs): return body def _subroutine_footer(self, o, **kwargs): + """ + Helper function/footer for :func:`~loki.backend.CCodegen.visit_Subroutine`. + """ footer = [self.format_line('}')] return footer @@ -234,7 +255,7 @@ def visit_Subroutine(self, o, **kwargs): Format as: ...imports... - int () { + () { ...spec without imports and argument declarations... ...body... } @@ -427,6 +448,12 @@ def visit_SymbolAttributes(self, o, **kwargs): # pylint: disable=unused-argumen return self.symgen.c_intrinsic_type(o) def visit_TypeDef(self, o, **kwargs): + """ + Format type definition/struct as + struct { + ...declarations... + }; + """ header = self.format_line('struct ', o.name.lower(), ' {') footer = self.format_line('};') self.depth += 1 diff --git a/loki/backend/cppgen.py b/loki/backend/cppgen.py index 8d7838f39..739900468 100644 --- a/loki/backend/cppgen.py +++ b/loki/backend/cppgen.py @@ -13,6 +13,9 @@ class IntrinsicTypeCpp(IntrinsicTypeC): + """ + Mapping Fortran type to corresponding C++ type. + """ def c_intrinsic_type(self, _type, *args, **kwargs): if _type.dtype == BasicType.INTEGER: @@ -25,6 +28,10 @@ def c_intrinsic_type(self, _type, *args, **kwargs): class CppCodeMapper(CCodeMapper): + """ + A :class:`StringifyMapper`-derived visitor for Pymbolic expression trees that converts an + expression to a string adhering to standardized C++. + """ # pylint: disable=abstract-method, unused-argument pass @@ -47,9 +54,6 @@ def _subroutine_header(self, o, **kwargs): return header def _subroutine_arguments(self, o, **kwargs): - # opt_extern = kwargs.get('extern', False) - # if opt_extern: - # return super()._subroutine_arguments(o, **kwargs) var_keywords = [] pass_by = [] for a in o.arguments: diff --git a/loki/backend/cudagen.py b/loki/backend/cudagen.py index 600d269ab..0323a8226 100644 --- a/loki/backend/cudagen.py +++ b/loki/backend/cudagen.py @@ -4,11 +4,8 @@ # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. -# from loki.expression import Array from loki.types import DerivedType from loki.backend.cppgen import CppCodegen, CppCodeMapper, IntrinsicTypeCpp - -# from loki.tools import as_tuple from loki.ir import Import, FindNodes from loki.expression import Array @@ -16,12 +13,19 @@ class IntrinsicTypeCuda(IntrinsicTypeCpp): + """ + Mapping Fortran type to corresponding CUDA type. + """ pass cuda_intrinsic_type = IntrinsicTypeCuda() class CudaCodeMapper(CppCodeMapper): + """ + A :class:`StringifyMapper`-derived visitor for Pymbolic expression trees that converts an + expression to a string adhering to standardized CUDA. + """ # pylint: disable=abstract-method, unused-argument pass @@ -84,8 +88,6 @@ def _subroutine_declaration(self, o, **kwargs): for a, p, k in zip(o.arguments, pass_by, var_keywords)] opt_header = kwargs.get('header', False) end = ' {' if not opt_header else ';' - # check whether to return something and define function return type accordingly - ## prefix = '' if o.prefix and "global" in o.prefix[0].lower(): prefix = '__global__ ' @@ -100,22 +102,16 @@ def _subroutine_declaration(self, o, **kwargs): def _subroutine_body(self, o, **kwargs): self.depth += 1 - - # body = ['{'] # ...and generate the spec without imports and argument declarations body = [self.visit(o.spec, skip_imports=True, skip_argument_declarations=True, **kwargs)] - # Fill the body body += [self.visit(o.body, **kwargs)] - opt_extern = kwargs.get('extern', False) if opt_extern: body += [self.format_line('cudaDeviceSynchronize();')] - # if something to be returned, add 'return ' statement if o.result_name is not None: body += [self.format_line(f'return {o.result_name.lower()};')] - # Close everything off self.depth -= 1 # footer = [self.format_line('}')] diff --git a/loki/transformations/block_index_transformations.py b/loki/transformations/block_index_transformations.py index 467b71ddd..1445acdc4 100644 --- a/loki/transformations/block_index_transformations.py +++ b/loki/transformations/block_index_transformations.py @@ -422,7 +422,10 @@ def process_kernel(self, routine, targets, exclude_arrays): class LowerBlockIndexTransformation(Transformation): """ - ... + Transformation to lower the block index via appending the block index + to variable dimensions/shape. However, this only handles variable + declarations/definitions. Therefore this transformation must always be followed by + the :any:`InjectBlockIndexTransformation`. Parameters ---------- @@ -505,29 +508,78 @@ def process(self, routine, targets, role): processed_routines += (call.routine.name,) - class LowerBlockLoopTransformation(Transformation): """ - ... + Lower the block loop to calls within this loop. + + For example, the following code: + + .. code-block:: fortran + + subroutine driver(nblks, ...) + ... + integer, intent(in) :: nblks + integer :: ibl + real :: var(jlon,nlev,nblks) + + do ibl=1,nblks + call kernel2(var,...,nblks,ibl) + enddo + ... + end subroutine driver + + subroutine kernel(var, ..., nblks, ibl) + ... + real :: var(jlon,nlev,nblks) + + do jl=1,... + do jk=1,... + var(jk,jl,ibl) = ... + end do + end do + end subroutine kernel + + is transformed to: + + .. code-block:: fortran + + subroutine driver(nblks, ...) + ... + integer, intent(in) :: nblks + integer :: ibl + real :: var(jlon,nlev,nblks) + + call kernel2(var,..., nblks) + ... + end subroutine driver + + subroutine kernel(var, ..., nblks) + ... + integer :: ibl + real :: var(jlon,nlev,nblks) + + do ibl=1,nblks + do jl=1,... + do jk=1,... + var(jk,jl,ibl) = ... + end do + end do + end do + end subroutine kernel Parameters ---------- block_dim : :any:`Dimension` :any:`Dimension` object describing the variable conventions used in code to define the blocking data dimension and iteration space. - recurse_to_kernels : bool, optional - Recurse/continue with/to (nested) kernels and lower the block index for those - as well (default: `False`). """ # This trafo only operates on procedures item_filter = (ProcedureItem,) - def __init__(self, block_dim): # , recurse_to_kernels=False): # , key=None): + def __init__(self, block_dim): self.block_dim = block_dim - # self.remove_loop = True def transform_subroutine(self, routine, **kwargs): - role = kwargs['role'] targets = tuple(str(t).lower() for t in as_tuple(kwargs.get('targets', None))) if role == 'driver': @@ -541,15 +593,10 @@ def arg_to_local_var(routine, var): type=routine.variable_map[var.name].type.clone(intent=None)),) def local_var(self, call, var): - # if var.name in call.arg_map: - # print(f"arg to local var [1] {var} | {call.arg_map[var.name]}") - # self.arg_to_local_var(call.routine, call.arg_map[var.name]) - # elif var.name in call.routine.arguments: if var.name in call.routine.arguments: self.arg_to_local_var(call.routine, var) else: call.routine.variables += (var.clone(scope=call.routine),) - # (routine.variable_map[var.name].clone(scope=call.routine)) @staticmethod def remove_openmp_pragmas(routine): @@ -565,21 +612,16 @@ def generate_pragma(loop): def process_driver(self, routine, targets): # find block loops - # with pragmas_attached(routine, (ir.Loop, ir.CallStatement)): - # if True: loops = FindNodes(ir.Loop).visit(routine.body) loops = [loop for loop in loops if loop.variable == self.block_dim.index or loop.variable in self.block_dim._index_aliases] - # if True: - # loop_map = {} - # ignore_routine = [] driver_loop_map = {} to_local_var = {} processed_routines = () calls = () for loop in loops: lower_loop = False - for call in FindNodes(ir.CallStatement).visit(loop.body): #visit(routine.body): + for call in FindNodes(ir.CallStatement).visit(loop.body): if str(call.name).lower() not in targets: continue lower_loop = True @@ -590,7 +632,7 @@ def process_driver(self, routine, targets): # replace/substitute variables according to the caller-callee argument map loop_to_lower = SubstituteExpressions(call_arg_map).visit(loop_to_lower) # remove calls that are not within targets # TODO: rather a hack to remove - # "CALL TIMER%THREAD_LOG(TID, IGPC=ICEND)" + # e.g., "CALL TIMER%THREAD_LOG(TID, IGPC=ICEND)" calls_within_loop = [_call for _call in FindNodes(ir.CallStatement).visit(loop_to_lower.body) if str(_call.name).lower() not in targets] loop_to_lower = Transformer({call: None for call in calls_within_loop}).visit(loop_to_lower) @@ -630,10 +672,9 @@ def process_driver(self, routine, targets): if lower_loop: driver_loop_map[loop] = loop.body routine.body = Transformer(driver_loop_map).visit(routine.body) - for call in calls: # FindNodes(ir.CallStatement).visit(routine.body): + for call in calls: if str(call.name).lower() not in targets: continue - # self.local_var(routine, call, loop.variable) for var in to_local_var[call.routine.name]: self.local_var(call, var) # TODO: remove diff --git a/loki/transformations/data_offload.py b/loki/transformations/data_offload.py index b1d2f5ad2..b247115fc 100644 --- a/loki/transformations/data_offload.py +++ b/loki/transformations/data_offload.py @@ -833,8 +833,8 @@ def _get_symbols(self, successors): """ Get module variables/symbols (grouped by routine/successor). """ - defines_symbols = CaseInsensitiveDict() # {} # CaseInsensitiveDict() - uses_symbols = CaseInsensitiveDict() # {} # CaseInsensitiveDict() + defines_symbols = CaseInsensitiveDict() + uses_symbols = CaseInsensitiveDict() for item in successors: if not isinstance(item, ProcedureItem): continue diff --git a/loki/transformations/single_column/base.py b/loki/transformations/single_column/base.py index 113652255..42f192804 100644 --- a/loki/transformations/single_column/base.py +++ b/loki/transformations/single_column/base.py @@ -51,6 +51,7 @@ def is_elemental(routine): Check whether :any:`Subroutine` ``routine`` is an elemental routine. Need for distinguishing elemental and non-elemental function to transform those in a different way. + Parameters ---------- routine: :any:`Subroutine` @@ -61,18 +62,18 @@ def is_elemental(routine): return True return False - @staticmethod - def check_array_dimensions_in_calls(routine): - calls = FindNodes(ir.CallStatement).visit(routine.body) - for call in calls: - for arg in call.arguments: - if isinstance(arg, sym.Array): - if any(dim == sym.RangeIndex((None, None)) for dim in arg.dimensions): - return False - return True - @staticmethod def remove_dimensions(routine, calls_only=False): + """ + Remove colon notation from array dimensions within :any:`Subroutine` ``routine``. + E.g., convert two-dimensional array ``arr2d(:,:)`` to ``arr2d`` or + ``arr3d(:,:,:)`` to ``arr3d``, but NOT e.g., ``arr(1,:,:)``. + + Parameters + ---------- + routine: :any:`Subroutine` + The subroutine to check + """ if calls_only: calls = FindNodes(ir.CallStatement).visit(routine.body) for call in calls: @@ -87,7 +88,6 @@ def remove_dimensions(routine, calls_only=False): else: arguments += (arg,) call._update(arguments=arguments) - else: arrays = [var for var in FindVariables(unique=False).visit(routine.body) if isinstance(var, sym.Array)] array_map = {} @@ -99,6 +99,16 @@ def remove_dimensions(routine, calls_only=False): @staticmethod def explicit_dimensions(routine): + """ + Make dimensions of arrays explicit within :any:`Subroutine` ``routine``. + E.g., convert two-dimensional array ``arr2d`` to ``arr2d(:,:)`` or + ``arr3d`` to ``arr3d(:,:,:)``. + + Parameters + ---------- + routine: :any:`Subroutine` + The subroutine to check + """ arrays = [var for var in FindVariables(unique=False).visit(routine.body) if isinstance(var, sym.Array)] array_map = {} for array in arrays: diff --git a/loki/transformations/single_column/scc_cuf.py b/loki/transformations/single_column/scc_cuf.py index 06954450f..7636f30be 100644 --- a/loki/transformations/single_column/scc_cuf.py +++ b/loki/transformations/single_column/scc_cuf.py @@ -28,7 +28,6 @@ 'HoistTemporaryArraysPragmaOffloadTransformation', 'SccLowLevelLaunchConfiguration', 'SccLowLevelDataOffload', - ] @@ -126,9 +125,55 @@ def device_subroutine_prefix(routine, depth): routine.prefix += ("ATTRIBUTES(DEVICE)",) class SccLowLevelLaunchConfiguration(Transformation): + """ + Part of the pipeline for generating Single Column Coalesced + Low Level GPU (CUDA Fortran, CUDA C, HIP, ...) for block-indexed gridpoint/single-column + routines (responsible for the launch configuration including the chevron notation). + """ - def __init__(self, horizontal, vertical, block_dim, transformation_type='parametrise', - derived_types=None, mode="CUF"): + def __init__(self, horizontal, vertical, block_dim, transformation_type='parametrise', mode="CUF"): + """ + Part of the pipeline for generating Single Column Coalesced + Low Level GPU (CUDA Fortran, CUDA C, HIP, ...) for block-indexed gridpoint/single-column + routines responsible for the launch configuration including the chevron notation. + + .. note:: + In dependence of the transformation type ``transformation_type``, further + transformations are necessary: + + * ``transformation_type = 'parametrise'`` requires a subsequent + :any:`ParametriseTransformation` transformation with the necessary information + to parametrise (at least) the ``vertical`` `size` + * ``transformation_type = 'hoist'`` requires subsequent :any:`HoistVariablesAnalysis` + and :class:`HoistVariablesTransformation` transformations (e.g. + :any:`HoistTemporaryArraysAnalysis` for analysis and + :any:`HoistTemporaryArraysTransformationDeviceAllocatable` or + :any:`HoistTemporaryArraysPragmaOffloadTransformation` for synthesis) + + Parameters + ---------- + horizontal : :any:`Dimension` + :any:`Dimension` object describing the variable conventions used in code + to define the horizontal data dimension and iteration space. + vertical : :any:`Dimension` + :any:`Dimension` object describing the variable conventions used in code + to define the vertical dimension, as needed to decide array privatization. + block_dim : :any:`Dimension` + :any:`Dimension` object to define the blocking dimension + to use for hoisted column arrays if hoisting is enabled. + transformation_type : str + Kind of transformation/Handling of temporaries/local arrays + + - `parametrise`: parametrising the array dimensions to make the vertical dimension + a compile-time constant + - `hoist`: host side hoisting of (relevant) arrays + mode: str + Mode/language to target + + - `CUF` - CUDA Fortran + - `CUDA` - CUDA C + - `HIP` - HIP + """ self.horizontal = horizontal self.vertical = vertical self.block_dim = block_dim @@ -143,12 +188,6 @@ def __init__(self, horizontal, vertical, block_dim, transformation_type='paramet self.transformation_description = {'parametrise': 'parametrised array dimensions of local arrays', 'hoist': 'host side hoisted local arrays'} - if derived_types is None: - self.derived_types = () - else: - self.derived_types = [_.upper() for _ in derived_types] - self.derived_type_variables = () - def transform_subroutine(self, routine, **kwargs): item = kwargs.get('item', None) @@ -464,9 +503,58 @@ def driver_launch_configuration(self, routine, block_dim, targets=None): class SccLowLevelDataOffload(Transformation): + """ + Part of the pipeline for generating Single Column Coalesced + Low Level GPU (CUDA Fortran, CUDA C, HIP, ...) for block-indexed gridpoint/single-column + routines (responsible for the data offload). + """ def __init__(self, horizontal, vertical, block_dim, transformation_type='parametrise', derived_types=None, mode="CUF"): + """ + Part of the pipeline for generating Single Column Coalesced + Low Level GPU (CUDA Fortran, CUDA C, HIP, ...) for block-indexed gridpoint/single-column + routines responsible for the data offload.. + + .. note:: + In dependence of the transformation type ``transformation_type``, further + transformations are necessary: + + * ``transformation_type = 'parametrise'`` requires a subsequent + :any:`ParametriseTransformation` transformation with the necessary information + to parametrise (at least) the ``vertical`` `size` + * ``transformation_type = 'hoist'`` requires subsequent :any:`HoistVariablesAnalysis` + and :class:`HoistVariablesTransformation` transformations (e.g. + :any:`HoistTemporaryArraysAnalysis` for analysis and + :any:`HoistTemporaryArraysTransformationDeviceAllocatable` or + :any:`HoistTemporaryArraysPragmaOffloadTransformation` for synthesis) + + Parameters + ---------- + horizontal : :any:`Dimension` + :any:`Dimension` object describing the variable conventions used in code + to define the horizontal data dimension and iteration space. + vertical : :any:`Dimension` + :any:`Dimension` object describing the variable conventions used in code + to define the vertical dimension, as needed to decide array privatization. + block_dim : :any:`Dimension` + :any:`Dimension` object to define the blocking dimension + to use for hoisted column arrays if hoisting is enabled. + derived_types: tuple + Derived types that are relevant + transformation_type : str + Kind of transformation/Handling of temporaries/local arrays + + - `parametrise`: parametrising the array dimensions to make the vertical dimension + a compile-time constant + - `hoist`: host side hoisting of (relevant) arrays + mode: str + Mode/language to target + + - `CUF` - CUDA Fortran + - `CUDA` - CUDA C + - `HIP` - HIP + """ self.horizontal = horizontal self.vertical = vertical self.block_dim = block_dim @@ -709,8 +797,6 @@ def driver_device_variables(self, routine, targets=None): pragma_map[pragma] = as_tuple(copy_end_pragmas) if pragma_map: routine.body = Transformer(pragma_map).visit(routine.body) - - # return else: # Declaration routine.spec.append(ir.Comment('')) diff --git a/loki/transformations/single_column/scc_low_level.py b/loki/transformations/single_column/scc_low_level.py index e035ce6d6..0fbdb66c3 100644 --- a/loki/transformations/single_column/scc_low_level.py +++ b/loki/transformations/single_column/scc_low_level.py @@ -48,7 +48,6 @@ def inline_elemental_kernel(routine, **kwargs): class InlineTransformation(Transformation): def transform_subroutine(self, routine, **kwargs): - # inline_elemental_kernel(routine, **kwargs) role = kwargs['role'] if role == 'kernel': @@ -56,6 +55,72 @@ def transform_subroutine(self, routine, **kwargs): inline_constant_parameters(routine, external_only=True) inline_elemental_functions(routine) + +""" +The basic Single Column Coalesced low-level GPU via CUDA-Fortran (SCC-CUF). + +This tranformation will convert kernels with innermost vectorisation +along a common horizontal dimension to a GPU-friendly loop-layout via +loop inversion and local array variable demotion. The resulting kernel +remains "vector-parallel", but with the ``horizontal`` loop as the +outermost iteration dimension (as far as data dependencies +allow). This allows local temporary arrays to be demoted to scalars, +where possible. + +Kernels are specified via ``'GLOBAL'`` and the number of threads that +execute the kernel for a given call is specified via the chevron syntax. + +This :any:`Pipeline` applies the following :any:`Transformation` +classes in sequence: +1. :any:`SCCBaseTransformation` - Ensure utility variables and resolve + problematic code constructs. +2. :any:`SCCDevectorTransformation` - Remove horizontal vector loops. +3. :any:`SCCDemoteTransformation` - Demote local temporary array + variables where appropriate. +4. :any:`SCCRevectorTransformation` - Re-insert the vecotr loops outermost, + according to identified vector sections. +5. :any:`LowerBlockIndexTransformation` - Lower the block index (for + array argument definitions). +6. :any:`InjectBlockIndexTransformation` - Complete the previous step + and inject the block index for the relevant arrays. +7. :any:`LowerBlockLoopTransformation` - Lower the block loop + from driver to kernel(s). +8. :any:`SCCLowLevelLaunchConfiguration` - Create launch configuration + and related things. +9. :any:`SCCLowLevelDataOffload` - Create/handle data offload + and related things. + +Parameters +---------- +horizontal : :any:`Dimension` + :any:`Dimension` object describing the variable conventions used in code + to define the horizontal data dimension and iteration space. +block_dim : :any:`Dimension` + Optional ``Dimension`` object to define the blocking dimension + to use for hoisted column arrays if hoisting is enabled. +directive : string or None + Directives flavour to use for parallelism annotations; either + ``'openacc'`` or ``None``. +trim_vector_sections : bool + Flag to trigger trimming of extracted vector sections to remove + nodes that are not assignments involving vector parallel arrays. +demote_local_arrays : bool + Flag to trigger local array demotion to scalar variables where possible +derived_types: tuple + List of relevant derived types +transformation_type : str + Kind of transformation/Handling of temporaries/local arrays + + - `parametrise`: parametrising the array dimensions to make the vertical dimension + a compile-time constant + - `hoist`: host side hoisting of (relevant) arrays +mode: str + Mode/language to target + + - `CUF` - CUDA Fortran + - `CUDA` - CUDA C + - `HIP` - HIP +""" SCCLowLevelCuf = partial( Pipeline, classes=( SCCBaseTransformation, @@ -70,6 +135,50 @@ def transform_subroutine(self, routine, **kwargs): ) ) +""" +The Single Column Coalesced low-level GPU via CUDA-Fortran (SCC-CUF) +handling temporaries via parametrisation. + +For details of the kernel and driver-side transformations, please +refer to :any:`SCCLowLevelCuf`. + +In addition, this pipeline will invoke +:any:`ParametriseTransformation` to parametrise relevant array +dimensions to allow having temporary arrays. + +Parameters +---------- +horizontal : :any:`Dimension` + :any:`Dimension` object describing the variable conventions used in code + to define the horizontal data dimension and iteration space. +block_dim : :any:`Dimension` + Optional ``Dimension`` object to define the blocking dimension + to use for hoisted column arrays if hoisting is enabled. +directive : string or None + Directives flavour to use for parallelism annotations; either + ``'openacc'`` or ``None``. +trim_vector_sections : bool + Flag to trigger trimming of extracted vector sections to remove + nodes that are not assignments involving vector parallel arrays. +demote_local_arrays : bool + Flag to trigger local array demotion to scalar variables where possible +derived_types: tuple + List of relevant derived types +transformation_type : str + Kind of transformation/Handling of temporaries/local arrays + + - `parametrise`: parametrising the array dimensions to make the vertical dimension + a compile-time constant + - `hoist`: host side hoisting of (relevant) arrays +mode: str + Mode/language to target + + - `CUF` - CUDA Fortran + - `CUDA` - CUDA C + - `HIP` - HIP +dic2p: dict + Dictionary of variable names and corresponding values to be parametrised. +""" SCCLowLevelCufParametrise = partial( Pipeline, classes=( SCCBaseTransformation, @@ -85,6 +194,49 @@ def transform_subroutine(self, routine, **kwargs): ) ) +""" +The Single Column Coalesced low-level GPU via CUDA-Fortran (SCC-CUF) +handling temporaries via hoisting. + +For details of the kernel and driver-side transformations, please +refer to :any:`SCCLowLevelCuf`. + +In addition, this pipeline will invoke +:any:`HoistTemporaryArraysAnalysis` and +:any:`HoistTemporaryArraysDeviceAllocatableTransformation` +to hoist temporary arrays. + +Parameters +---------- +horizontal : :any:`Dimension` + :any:`Dimension` object describing the variable conventions used in code + to define the horizontal data dimension and iteration space. +block_dim : :any:`Dimension` + Optional ``Dimension`` object to define the blocking dimension + to use for hoisted column arrays if hoisting is enabled. +directive : string or None + Directives flavour to use for parallelism annotations; either + ``'openacc'`` or ``None``. +trim_vector_sections : bool + Flag to trigger trimming of extracted vector sections to remove + nodes that are not assignments involving vector parallel arrays. +demote_local_arrays : bool + Flag to trigger local array demotion to scalar variables where possible +derived_types: tuple + List of relevant derived types +transformation_type : str + Kind of transformation/Handling of temporaries/local arrays + + - `parametrise`: parametrising the array dimensions to make the vertical dimension + a compile-time constant + - `hoist`: host side hoisting of (relevant) arrays +mode: str + Mode/language to target + + - `CUF` - CUDA Fortran + - `CUDA` - CUDA C + - `HIP` - HIP +""" SCCLowLevelCufHoist = partial( Pipeline, classes=( SCCBaseTransformation, @@ -101,6 +253,83 @@ def transform_subroutine(self, routine, **kwargs): ) ) +""" +The Single Column Coalesced low-level GPU via low-level C-style +kernel language (CUDA, HIP, ...) handling temporaries via parametrisation. + +This tranformation will convert kernels with innermost vectorisation +along a common horizontal dimension to a GPU-friendly loop-layout via +loop inversion and local array variable demotion. The resulting kernel +remains "vector-parallel", but with the ``horizontal`` loop as the +outermost iteration dimension (as far as data dependencies +allow). This allows local temporary arrays to be demoted to scalars, +where possible. + +Kernels are specified via e.g., ``'__global__'`` and the number of threads that +execute the kernel for a given call is specified via the chevron syntax. + +This :any:`Pipeline` applies the following :any:`Transformation` +classes in sequence: +1. :any:`InlineTransformation` - Inline constants and elemental + functions. +2. :any:`GlobalVariableAnalysis` - Analysis of global variables +3. :any:`GlobalVarHoistTransformation` - Hoist global variables + to the driver. +4. :any:`DerivedTypeArgumentsTransformation` - Flatten derived types/ + remove derived types from procedure signatures by replacing the + (relevant) derived type arguments by its member variables. +5. :any:`SCCBaseTransformation` - Ensure utility variables and resolve + problematic code constructs. +6. :any:`SCCDevectorTransformation` - Remove horizontal vector loops. +7. :any:`SCCDemoteTransformation` - Demote local temporary array + variables where appropriate. +8. :any:`SCCRevectorTransformation` - Re-insert the vecotr loops outermost, + according to identified vector sections. +9. :any:`LowerBlockIndexTransformation` - Lower the block index (for + array argument definitions). +10. :any:`InjectBlockIndexTransformation` - Complete the previous step + and inject the block index for the relevant arrays. +11. :any:`LowerBlockLoopTransformation` - Lower the block loop + from driver to kernel(s). +12. :any:`SCCLowLevelLaunchConfiguration` - Create launch configuration + and related things. +13. :any:`SCCLowLevelDataOffload` - Create/handle data offload + and related things. +14. :any:`ParametriseTransformation` - Parametrise according to ``dic2p``. + +Parameters +---------- +horizontal : :any:`Dimension` + :any:`Dimension` object describing the variable conventions used in code + to define the horizontal data dimension and iteration space. +block_dim : :any:`Dimension` + Optional ``Dimension`` object to define the blocking dimension + to use for hoisted column arrays if hoisting is enabled. +directive : string or None + Directives flavour to use for parallelism annotations; either + ``'openacc'`` or ``None``. +trim_vector_sections : bool + Flag to trigger trimming of extracted vector sections to remove + nodes that are not assignments involving vector parallel arrays. +demote_local_arrays : bool + Flag to trigger local array demotion to scalar variables where possible +derived_types: tuple + List of relevant derived types +transformation_type : str + Kind of transformation/Handling of temporaries/local arrays + + - `parametrise`: parametrising the array dimensions to make the vertical dimension + a compile-time constant + - `hoist`: host side hoisting of (relevant) arrays +mode: str + Mode/language to target + + - `CUF` - CUDA Fortran + - `CUDA` - CUDA C + - `HIP` - HIP +dic2p: dict + Dictionary of variable names and corresponding values to be parametrised. +""" SCCLowLevelParametrise = partial( Pipeline, classes=( InlineTransformation, @@ -120,6 +349,83 @@ def transform_subroutine(self, routine, **kwargs): ) ) +""" +The Single Column Coalesced low-level GPU via low-level C-style +kernel language (CUDA, HIP, ...) handling temporaries via parametrisation. + +This tranformation will convert kernels with innermost vectorisation +along a common horizontal dimension to a GPU-friendly loop-layout via +loop inversion and local array variable demotion. The resulting kernel +remains "vector-parallel", but with the ``horizontal`` loop as the +outermost iteration dimension (as far as data dependencies +allow). This allows local temporary arrays to be demoted to scalars, +where possible. + +Kernels are specified via e.g., ``'__global__'`` and the number of threads that +execute the kernel for a given call is specified via the chevron syntax. + +This :any:`Pipeline` applies the following :any:`Transformation` +classes in sequence: +1. :any:`InlineTransformation` - Inline constants and elemental + functions. +2. :any:`GlobalVariableAnalysis` - Analysis of global variables +3. :any:`GlobalVarHoistTransformation` - Hoist global variables + to the driver. +4. :any:`DerivedTypeArgumentsTransformation` - Flatten derived types/ + remove derived types from procedure signatures by replacing the + (relevant) derived type arguments by its member variables. +5. :any:`SCCBaseTransformation` - Ensure utility variables and resolve + problematic code constructs. +6. :any:`SCCDevectorTransformation` - Remove horizontal vector loops. +7. :any:`SCCDemoteTransformation` - Demote local temporary array + variables where appropriate. +8. :any:`SCCRevectorTransformation` - Re-insert the vecotr loops outermost, + according to identified vector sections. +9. :any:`LowerBlockIndexTransformation` - Lower the block index (for + array argument definitions). +10. :any:`InjectBlockIndexTransformation` - Complete the previous step + and inject the block index for the relevant arrays. +11. :any:`LowerBlockLoopTransformation` - Lower the block loop + from driver to kernel(s). +12. :any:`SCCLowLevelLaunchConfiguration` - Create launch configuration + and related things. +13. :any:`SCCLowLevelDataOffload` - Create/handle data offload + and related things. +14. :any:`HoistTemporaryArraysAnalysis` - Analysis part of hoisting. +15. :any:`HoistTemporaryArraysPragmaOffloadTransformation` - Syntesis + part of hoisting. + +Parameters +---------- +horizontal : :any:`Dimension` + :any:`Dimension` object describing the variable conventions used in code + to define the horizontal data dimension and iteration space. +block_dim : :any:`Dimension` + Optional ``Dimension`` object to define the blocking dimension + to use for hoisted column arrays if hoisting is enabled. +directive : string or None + Directives flavour to use for parallelism annotations; either + ``'openacc'`` or ``None``. +trim_vector_sections : bool + Flag to trigger trimming of extracted vector sections to remove + nodes that are not assignments involving vector parallel arrays. +demote_local_arrays : bool + Flag to trigger local array demotion to scalar variables where possible +derived_types: tuple + List of relevant derived types +transformation_type : str + Kind of transformation/Handling of temporaries/local arrays + + - `parametrise`: parametrising the array dimensions to make the vertical dimension + a compile-time constant + - `hoist`: host side hoisting of (relevant) arrays +mode: str + Mode/language to target + + - `CUF` - CUDA Fortran + - `CUDA` - CUDA C + - `HIP` - HIP +""" SCCLowLevelHoist = partial( Pipeline, classes=( InlineTransformation, @@ -135,7 +441,7 @@ def transform_subroutine(self, routine, **kwargs): LowerBlockLoopTransformation, SccLowLevelLaunchConfiguration, SccLowLevelDataOffload, - HoistTemporaryArraysAnalysis, + HoistTemporaryArraysAnalysis, HoistTemporaryArraysPragmaOffloadTransformation ) ) diff --git a/loki/transformations/tests/test_block_index_inject.py b/loki/transformations/tests/test_block_index_inject.py index 35d6d41b7..0c8f9dd49 100644 --- a/loki/transformations/tests/test_block_index_inject.py +++ b/loki/transformations/tests/test_block_index_inject.py @@ -390,8 +390,6 @@ def test_blockview_to_fieldview_exception(frontend, horizontal): targets=('compute',)) -# @pytest.mark.parametrize('frontend', available_frontends(xfail=[(OMNI, -# 'OMNI correctly complains about rank mismatch in assignment.')])) @pytest.mark.parametrize('frontend', available_frontends()) @pytest.mark.parametrize('block_dim_arg', (False, True)) @pytest.mark.parametrize('recurse_to_kernels', (False, True)) @@ -538,18 +536,7 @@ def test_simple_lower_loop(blocking, frontend, block_dim_arg, recurse_to_kernels assert blocking.index not in kernel_mod['kernel'].arguments assert blocking.index in kernel_mod['kernel'].variable_map - # print(f"---------------\ndriver:\n{fgen(driver)}") - # print(f"---------------\nkernel:\n{fgen(kernel_mod['kernel'])}") - # print(f"---------------\nkernel:\n{fgen(nested_kernel_mod['compute'])}") - # print("\n\n") - # print(f"kernel.symbol_table: {dict(kernel['kernel'].symbol_attrs)}") - # assigns = FindNodes(Assignment).visit(kernel.body) - # assert assigns[0].lhs == 'var(:,:,ibl)' - # calls = FindNodes(CallStatement).visit(kernel.body) - # assert 'var(:,:,ibl)' in calls[0].arguments - -# @pytest.mark.parametrize('frontend', available_frontends(xfail=[(OMNI, -# 'OMNI correctly complains about rank mismatch in assignment.')])) + @pytest.mark.parametrize('frontend', available_frontends()) @pytest.mark.parametrize('recurse_to_kernels', (False, True)) @pytest.mark.parametrize('targets', (('kernel', 'another_kernel', 'compute'), ('kernel', 'compute'))) @@ -753,9 +740,3 @@ def test_lower_loop(blocking, frontend, recurse_to_kernels, targets): assert blocking.size in kernel_mod['kernel'].arguments assert blocking.index not in kernel_mod['kernel'].arguments assert blocking.index in kernel_mod['kernel'].variable_map - - # print(f"---------------\ndriver:\n{fgen(driver)}") - # print(f"---------------\nkernel:\n{fgen(kernel_mod['kernel'])}") - # print(f"---------------\nkernel:\n{fgen(another_kernel_mod['another_kernel'])}") - # print(f"---------------\nkernel:\n{fgen(nested_kernel_mod['compute'])}") - # print("\n\n") diff --git a/loki/transformations/transpile/fortran_c.py b/loki/transformations/transpile/fortran_c.py index 210084770..811a442b3 100644 --- a/loki/transformations/transpile/fortran_c.py +++ b/loki/transformations/transpile/fortran_c.py @@ -116,15 +116,13 @@ class FortranCTransformation(Transformation): # Set of standard module names that have no C equivalent __fortran_intrinsic_modules = ['ISO_FORTRAN_ENV', 'ISO_C_BINDING'] - def __init__(self, inline_elementals=True, use_c_ptr=False, path=None, language='c'): # codegen=cgen): + def __init__(self, inline_elementals=True, use_c_ptr=False, path=None, language='c'): self.inline_elementals = inline_elementals self.use_c_ptr = use_c_ptr self.path = Path(path) if path is not None else None - # self.codegen = codegen self.language = language.lower() assert self.language in ['c', 'cuda'] # , 'hip'] - # self.langgen = cgen if self.language == 'c' else cppgen if self.language == 'c': self.codegen = cgen elif self.language == 'cuda': @@ -195,21 +193,14 @@ def transform_subroutine(self, routine, **kwargs): contains = Section(body=(Intrinsic('CONTAINS'), wrapper)) self.wrapperpath = (path/wrapper.name.lower()).with_suffix('.F90') module = Module(name=f'{wrapper.name.upper()}_MOD', contains=contains) - ### new ### module.spec = Section(body=(Import(module='iso_c_binding'),)) # Generate C source file from Loki IR c_kernel = self.generate_c_kernel(routine, targets=targets) self.c_path = (path/c_kernel.name.lower()).with_suffix('.c') - # Sourcefile.to_file(source=self.langgen(c_kernel), path=self.c_path) - ### end new ### Sourcefile.to_file(source=fgen(module), path=self.wrapperpath) # Generate C source file from Loki IR - # c_kernel = self.generate_c_kernel(routine) - # self.c_path = (path/c_kernel.name.lower()).with_suffix('.c') - # Sourcefile.to_file(source=self.codegen(c_kernel), path=self.c_path) - #### new #### # c_kernel.spec.prepend(Import(module=f'{c_kernel.name.lower()}.h', c_import=True)) for successor in successors: if self.language == 'c': @@ -219,32 +210,19 @@ def transform_subroutine(self, routine, **kwargs): if not isinstance(successor, ProcedureItem): c_kernel.spec.prepend(Import(module=f'{successor.routine.name.lower()}_c.c', c_import=True)) - # Sourcefile.to_file(source=self.langgen(c_kernel), path=self.c_path) - if depth == 1: if self.language != 'c': c_kernel_launch = c_kernel.clone(name=f"{c_kernel.name}_launch", prefix="extern_c") self.generate_c_kernel_launch(c_kernel_launch, c_kernel) self.c_path = (path/c_kernel_launch.name.lower()).with_suffix('.h') Sourcefile.to_file(source=self.codegen(c_kernel_launch, extern=True), path=self.c_path) - else: - # TODO: nested device routines ..., should work correctly? - # c_kernel_header = c_kernel.clone(name=f"{c_kernel.name}", prefix="header_only device") - pass - # c_kernel_header = c_kernel.clone(name=f"{routine.name.lower()}", prefix="header_only device") - # TODO: this shouldn't be necessary anymore usinge self.codegen(..., header=True) - # self.generate_c_kernel_header(c_kernel_header) - # self.c_path =(path/c_kernel_header.name.lower()).with_suffix('.h') - # Sourcefile.to_file(source=self.codegen(c_kernel_header), path=self.c_path) - - ## new + assignments = FindNodes(Assignment).visit(c_kernel.body) assignments2remove = ['griddim', 'blockdim'] assignment_map = {assignment: None for assignment in assignments if assignment.lhs.name.lower() in assignments2remove} # == block_dim.index.lower() or assignment.lhs.name.lower() in [_.lower() for _ in horizontal.bounds]} c_kernel.body = Transformer(assignment_map).visit(c_kernel.body) - ## end:new if depth > 1: c_kernel.spec.prepend(Import(module=f'{c_kernel.name.lower()}.h', c_import=True)) @@ -253,7 +231,6 @@ def transform_subroutine(self, routine, **kwargs): self.c_path = (path/c_kernel.name.lower()).with_suffix('.h') Sourcefile.to_file(source=self.codegen(c_kernel, header=True), path=self.c_path) self.c_path = (path/c_kernel.name.lower()).with_suffix('.c') - #### end new #### def c_struct_typedef(self, derived): """ @@ -679,13 +656,6 @@ def generate_c_kernel_launch(self, kernel_launch, kernel, **kwargs): kernel_call = kernel.clone() call_arguments = [] for arg in kernel_call.arguments: - # TODO: ? - # if False: # isinstance(arg, Array): - # # _type = arg.type.clone(pointer=False) - # # kernel_call.symbol_attrs[arg.name] = _type - # call_arguments.append(arg.clone(dimensions=None)) - # # call_arguments.append(arg.clone(dimensions=None, type=_type)) - # else: call_arguments.append(arg) griddim = None