From 1afb822b5a63277c4a7a9c6ca51628be0edc1c14 Mon Sep 17 00:00:00 2001
From: Michael Staneker <michael.staneker@ecmwf.int>
Date: Wed, 19 Jun 2024 08:54:13 +0000
Subject: [PATCH] final clean-up, add documentation, etc

---
 loki/backend/cgen.py                          |  31 +-
 loki/backend/cppgen.py                        |  10 +-
 loki/backend/cudagen.py                       |  18 +-
 .../block_index_transformations.py            |  87 +++--
 loki/transformations/data_offload.py          |   4 +-
 loki/transformations/single_column/base.py    |  32 +-
 loki/transformations/single_column/scc_cuf.py | 108 +++++-
 .../single_column/scc_low_level.py            | 310 +++++++++++++++++-
 .../tests/test_block_index_inject.py          |  21 +-
 loki/transformations/transpile/fortran_c.py   |  34 +-
 10 files changed, 538 insertions(+), 117 deletions(-)
diff --git a/loki/backend/cgen.py b/loki/backend/cgen.py
index 64680871d..6234dd7a6 100644
--- a/loki/backend/cgen.py
+++ b/loki/backend/cgen.py
@@ -22,6 +22,9 @@
 
 
 class IntrinsicTypeC:
+    """
+    Mapping Fortran type to corresponding C type.
+    """
     # pylint: disable=abstract-method, unused-argument
 
     def __init__(self, *args, **kwargs):
@@ -45,6 +48,10 @@ def c_intrinsic_type(self, _type, *args, **kwargs):
 c_intrinsic_type = IntrinsicTypeC()
 
 class CCodeMapper(LokiStringifyMapper):
+    """
+    A :class:`StringifyMapper`-derived visitor for Pymbolic expression trees that converts an
+    expression to a string adhering to standardized C.
+    """
     # pylint: disable=abstract-method, unused-argument
 
     def __init__(self, c_intrinsic_type, *args, **kwargs):
@@ -103,7 +110,6 @@ def map_array_subscript(self, expr, enclosing_prec, *args, **kwargs):
                 if d:
                     index_str += self.format('[%s]', d)
             return self.format('%s%s', name_str, index_str)
-        # else:
         return self.format('%s', name_str)
 
     map_string_subscript = map_array_subscript
@@ -170,6 +176,9 @@ def visit_Module(self, o, **kwargs):
         return self.join_lines(spec, routines)
 
     def _subroutine_header(self, o, **kwargs):
+        """
+        Helper function/header for :func:`~loki.backend.CCodegen.visit_Subroutine`.
+        """
         # Some boilerplate imports...
         header = [self.format_line('#include <', name, '>') for name in self.standard_imports]
         # ...and imports from the spec
@@ -178,6 +187,9 @@ def _subroutine_header(self, o, **kwargs):
         return header
 
     def _subroutine_arguments(self, o, **kwargs):
+        """
+        Helper function/routine arguments for :func:`~loki.backend.CCodegen.visit_Subroutine`.
+        """
         var_keywords = []
         pass_by = []
         for a in o.arguments:
@@ -193,6 +205,9 @@ def _subroutine_arguments(self, o, **kwargs):
         return pass_by, var_keywords
 
     def _subroutine_declaration(self, o, **kwargs):
+        """
+        Helper function/function declaration part for :func:`~loki.backend.CCodegen.visit_Subroutine`.
+        """
         pass_by, var_keywords = self._subroutine_arguments(o, **kwargs)
         arguments = [f'{k}{self.visit(a.type, **kwargs)} {p}{a.name}'
                      for a, p, k in zip(o.arguments, pass_by, var_keywords)]
@@ -207,6 +222,9 @@ def _subroutine_declaration(self, o, **kwargs):
         return declaration
 
     def _subroutine_body(self, o, **kwargs):
+        """
+        Helper function/body for :func:`~loki.backend.CCodegen.visit_Subroutine`.
+        """
         self.depth += 1
 
         # body = ['{']
@@ -226,6 +244,9 @@ def _subroutine_body(self, o, **kwargs):
         return body
 
     def _subroutine_footer(self, o, **kwargs):
+        """
+        Helper function/footer for :func:`~loki.backend.CCodegen.visit_Subroutine`.
+        """
         footer = [self.format_line('}')]
         return footer
 
@@ -234,7 +255,7 @@ def visit_Subroutine(self, o, **kwargs):
         Format as:
 
           ...imports...
-          int <name>(<args>) {
+          <return_type> <name>(<args>) {
             ...spec without imports and argument declarations...
             ...body...
           }
@@ -427,6 +448,12 @@ def visit_SymbolAttributes(self, o, **kwargs):  # pylint: disable=unused-argumen
         return self.symgen.c_intrinsic_type(o)
 
     def visit_TypeDef(self, o, **kwargs):
+        """
+        Format type definition/struct as
+          struct <name> {
+            ...declarations...
+          };
+        """
         header = self.format_line('struct ', o.name.lower(), ' {')
         footer = self.format_line('};')
         self.depth += 1
diff --git a/loki/backend/cppgen.py b/loki/backend/cppgen.py
index 8d7838f39..739900468 100644
--- a/loki/backend/cppgen.py
+++ b/loki/backend/cppgen.py
@@ -13,6 +13,9 @@
 
 
 class IntrinsicTypeCpp(IntrinsicTypeC):
+    """
+    Mapping Fortran type to corresponding C++ type.
+    """
 
     def c_intrinsic_type(self, _type, *args, **kwargs):
         if _type.dtype == BasicType.INTEGER:
@@ -25,6 +28,10 @@ def c_intrinsic_type(self, _type, *args, **kwargs):
 
 
 class CppCodeMapper(CCodeMapper):
+    """
+    A :class:`StringifyMapper`-derived visitor for Pymbolic expression trees that converts an
+    expression to a string adhering to standardized C++.
+    """
     # pylint: disable=abstract-method, unused-argument
     pass
 
@@ -47,9 +54,6 @@ def _subroutine_header(self, o, **kwargs):
         return header
 
     def _subroutine_arguments(self, o, **kwargs):
-        # opt_extern = kwargs.get('extern', False)
-        # if opt_extern:
-        #     return super()._subroutine_arguments(o, **kwargs)
         var_keywords = []
         pass_by = []
         for a in o.arguments:
diff --git a/loki/backend/cudagen.py b/loki/backend/cudagen.py
index 600d269ab..0323a8226 100644
--- a/loki/backend/cudagen.py
+++ b/loki/backend/cudagen.py
@@ -4,11 +4,8 @@
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
-# from loki.expression import Array
 from loki.types import DerivedType
 from loki.backend.cppgen import CppCodegen, CppCodeMapper, IntrinsicTypeCpp
-
-# from loki.tools import as_tuple
 from loki.ir import Import, FindNodes
 from loki.expression import Array
 
@@ -16,12 +13,19 @@
 
 
 class IntrinsicTypeCuda(IntrinsicTypeCpp):
+    """
+    Mapping Fortran type to corresponding CUDA type.
+    """
     pass
 
 cuda_intrinsic_type = IntrinsicTypeCuda()
 
 
 class CudaCodeMapper(CppCodeMapper):
+    """
+    A :class:`StringifyMapper`-derived visitor for Pymbolic expression trees that converts an
+    expression to a string adhering to standardized CUDA.
+    """
     # pylint: disable=abstract-method, unused-argument
     pass
 
@@ -84,8 +88,6 @@ def _subroutine_declaration(self, o, **kwargs):
                      for a, p, k in zip(o.arguments, pass_by, var_keywords)]
         opt_header = kwargs.get('header', False)
         end = ' {' if not opt_header else ';'
-        # check whether to return something and define function return type accordingly
-        ##
         prefix = ''
         if o.prefix and "global" in o.prefix[0].lower():
             prefix = '__global__ '
@@ -100,22 +102,16 @@ def _subroutine_declaration(self, o, **kwargs):
 
     def _subroutine_body(self, o, **kwargs):
         self.depth += 1
-
-        # body = ['{']
         # ...and generate the spec without imports and argument declarations
         body = [self.visit(o.spec, skip_imports=True, skip_argument_declarations=True, **kwargs)]
-
         # Fill the body
         body += [self.visit(o.body, **kwargs)]
-
         opt_extern = kwargs.get('extern', False)
         if opt_extern:
             body += [self.format_line('cudaDeviceSynchronize();')]
-
         # if something to be returned, add 'return <var>' statement
         if o.result_name is not None:
             body += [self.format_line(f'return {o.result_name.lower()};')]
-
         # Close everything off
         self.depth -= 1
         # footer = [self.format_line('}')]
diff --git a/loki/transformations/block_index_transformations.py b/loki/transformations/block_index_transformations.py
index 467b71ddd..1445acdc4 100644
--- a/loki/transformations/block_index_transformations.py
+++ b/loki/transformations/block_index_transformations.py
@@ -422,7 +422,10 @@ def process_kernel(self, routine, targets, exclude_arrays):
 
 class LowerBlockIndexTransformation(Transformation):
     """
-    ...
+    Transformation to lower the block index via appending the block index 
+    to variable dimensions/shape. However, this only handles variable
+    declarations/definitions. Therefore this transformation must always be followed by
+    the :any:`InjectBlockIndexTransformation`.
 
     Parameters
     ----------
@@ -505,29 +508,78 @@ def process(self, routine, targets, role):
             processed_routines += (call.routine.name,)
 
 
-
 class LowerBlockLoopTransformation(Transformation):
     """
-    ...
+    Lower the block loop to calls within this loop. 
+
+    For example, the following code:
+
+    .. code-block:: fortran
+
+        subroutine driver(nblks, ...)
+            ...
+            integer, intent(in) :: nblks
+            integer :: ibl
+            real :: var(jlon,nlev,nblks)
+
+            do ibl=1,nblks
+                call kernel2(var,...,nblks,ibl)
+            enddo
+            ...
+        end subroutine driver
+
+        subroutine kernel(var, ..., nblks, ibl)
+            ...
+            real :: var(jlon,nlev,nblks)
+
+            do jl=1,...
+                do jk=1,...
+                    var(jk,jl,ibl) = ...
+                end do
+            end do
+        end subroutine kernel
+
+    is transformed to:
+    
+    .. code-block:: fortran
+
+        subroutine driver(nblks, ...)
+            ...
+            integer, intent(in) :: nblks
+            integer :: ibl
+            real :: var(jlon,nlev,nblks)
+
+            call kernel2(var,..., nblks)
+            ...
+        end subroutine driver
+
+        subroutine kernel(var, ..., nblks)
+            ...
+            integer :: ibl
+            real :: var(jlon,nlev,nblks)
+            
+            do ibl=1,nblks
+                do jl=1,...
+                    do jk=1,...
+                        var(jk,jl,ibl) = ...
+                    end do
+                end do
+            end do
+        end subroutine kernel
 
     Parameters
     ----------
     block_dim : :any:`Dimension`
         :any:`Dimension` object describing the variable conventions used in code
         to define the blocking data dimension and iteration space.
-    recurse_to_kernels : bool, optional
-        Recurse/continue with/to (nested) kernels and lower the block index for those
-        as well (default: `False`).
     """
     # This trafo only operates on procedures
     item_filter = (ProcedureItem,)
 
-    def __init__(self, block_dim): # , recurse_to_kernels=False): # , key=None):
+    def __init__(self, block_dim):
         self.block_dim = block_dim
-        # self.remove_loop = True
 
     def transform_subroutine(self, routine, **kwargs):
-
         role = kwargs['role']
         targets = tuple(str(t).lower() for t in as_tuple(kwargs.get('targets', None)))
         if role == 'driver':
@@ -541,15 +593,10 @@ def arg_to_local_var(routine, var):
             type=routine.variable_map[var.name].type.clone(intent=None)),)
 
     def local_var(self, call, var):
-        # if var.name in call.arg_map:
-        #     print(f"arg to local var [1] {var} | {call.arg_map[var.name]}")
-        #     self.arg_to_local_var(call.routine, call.arg_map[var.name])
-        # elif var.name in call.routine.arguments:
         if var.name in call.routine.arguments:
             self.arg_to_local_var(call.routine, var)
         else:
             call.routine.variables += (var.clone(scope=call.routine),)
-            # (routine.variable_map[var.name].clone(scope=call.routine))
 
     @staticmethod
     def remove_openmp_pragmas(routine):
@@ -565,21 +612,16 @@ def generate_pragma(loop):
 
     def process_driver(self, routine, targets):
         # find block loops
-        # with pragmas_attached(routine, (ir.Loop, ir.CallStatement)):
-        # if True:
         loops = FindNodes(ir.Loop).visit(routine.body)
         loops = [loop for loop in loops if loop.variable == self.block_dim.index
                 or loop.variable in self.block_dim._index_aliases]
-        # if True:
-        # loop_map = {}
-        # ignore_routine = []
         driver_loop_map = {}
         to_local_var = {}
         processed_routines = ()
         calls = ()
         for loop in loops:
             lower_loop = False
-            for call in  FindNodes(ir.CallStatement).visit(loop.body): #visit(routine.body):
+            for call in  FindNodes(ir.CallStatement).visit(loop.body):
                 if str(call.name).lower() not in targets:
                     continue
                 lower_loop = True
@@ -590,7 +632,7 @@ def process_driver(self, routine, targets):
                 # replace/substitute variables according to the caller-callee argument map
                 loop_to_lower = SubstituteExpressions(call_arg_map).visit(loop_to_lower)
                 # remove calls that are not within targets # TODO: rather a hack to remove
-                #  "CALL TIMER%THREAD_LOG(TID, IGPC=ICEND)"
+                #  e.g., "CALL TIMER%THREAD_LOG(TID, IGPC=ICEND)"
                 calls_within_loop = [_call for _call in  FindNodes(ir.CallStatement).visit(loop_to_lower.body)
                         if str(_call.name).lower() not in targets]
                 loop_to_lower = Transformer({call: None for call in calls_within_loop}).visit(loop_to_lower)
@@ -630,10 +672,9 @@ def process_driver(self, routine, targets):
             if lower_loop:
                 driver_loop_map[loop] = loop.body
         routine.body = Transformer(driver_loop_map).visit(routine.body)
-        for call in calls: #  FindNodes(ir.CallStatement).visit(routine.body):
+        for call in calls:
             if str(call.name).lower() not in targets:
                 continue
-            # self.local_var(routine, call, loop.variable)
             for var in to_local_var[call.routine.name]:
                 self.local_var(call, var)
         # TODO: remove
diff --git a/loki/transformations/data_offload.py b/loki/transformations/data_offload.py
index b1d2f5ad2..b247115fc 100644
--- a/loki/transformations/data_offload.py
+++ b/loki/transformations/data_offload.py
@@ -833,8 +833,8 @@ def _get_symbols(self, successors):
         """
         Get module variables/symbols (grouped by routine/successor).
         """
-        defines_symbols = CaseInsensitiveDict() # {} # CaseInsensitiveDict()
-        uses_symbols = CaseInsensitiveDict() # {} # CaseInsensitiveDict()
+        defines_symbols = CaseInsensitiveDict()
+        uses_symbols = CaseInsensitiveDict()
         for item in successors:
             if not isinstance(item, ProcedureItem):
                 continue
diff --git a/loki/transformations/single_column/base.py b/loki/transformations/single_column/base.py
index 113652255..42f192804 100644
--- a/loki/transformations/single_column/base.py
+++ b/loki/transformations/single_column/base.py
@@ -51,6 +51,7 @@ def is_elemental(routine):
         Check whether :any:`Subroutine` ``routine`` is an elemental routine.
         Need for distinguishing elemental and non-elemental function to transform
         those in a different way.
+
         Parameters
         ----------
         routine: :any:`Subroutine`
@@ -61,18 +62,18 @@ def is_elemental(routine):
                 return True
         return False
 
-    @staticmethod
-    def check_array_dimensions_in_calls(routine):
-        calls = FindNodes(ir.CallStatement).visit(routine.body)
-        for call in calls:
-            for arg in call.arguments:
-                if isinstance(arg, sym.Array):
-                    if any(dim == sym.RangeIndex((None, None)) for dim in arg.dimensions):
-                        return False
-        return True
-
     @staticmethod
     def remove_dimensions(routine, calls_only=False):
+        """
+        Remove colon notation from array dimensions within :any:`Subroutine` ``routine``.
+        E.g., convert two-dimensional array ``arr2d(:,:)`` to ``arr2d`` or
+        ``arr3d(:,:,:)`` to ``arr3d``, but NOT e.g., ``arr(1,:,:)``.
+
+        Parameters
+        ----------
+        routine: :any:`Subroutine`
+            The subroutine to check
+        """
         if calls_only:
             calls = FindNodes(ir.CallStatement).visit(routine.body)
             for call in calls:
@@ -87,7 +88,6 @@ def remove_dimensions(routine, calls_only=False):
                     else:
                         arguments += (arg,)
                 call._update(arguments=arguments)
-
         else:
             arrays = [var for var in FindVariables(unique=False).visit(routine.body) if isinstance(var, sym.Array)]
             array_map = {}
@@ -99,6 +99,16 @@ def remove_dimensions(routine, calls_only=False):
 
     @staticmethod
     def explicit_dimensions(routine):
+        """
+        Make dimensions of arrays explicit within :any:`Subroutine` ``routine``.
+        E.g., convert two-dimensional array ``arr2d`` to ``arr2d(:,:)`` or
+        ``arr3d`` to ``arr3d(:,:,:)``.
+
+        Parameters
+        ----------
+        routine: :any:`Subroutine`
+            The subroutine to check
+        """
         arrays = [var for var in FindVariables(unique=False).visit(routine.body) if isinstance(var, sym.Array)]
         array_map = {}
         for array in arrays:
diff --git a/loki/transformations/single_column/scc_cuf.py b/loki/transformations/single_column/scc_cuf.py
index 06954450f..7636f30be 100644
--- a/loki/transformations/single_column/scc_cuf.py
+++ b/loki/transformations/single_column/scc_cuf.py
@@ -28,7 +28,6 @@
     'HoistTemporaryArraysPragmaOffloadTransformation',
     'SccLowLevelLaunchConfiguration',
     'SccLowLevelDataOffload',
-
 ]
 
 
@@ -126,9 +125,55 @@ def device_subroutine_prefix(routine, depth):
         routine.prefix += ("ATTRIBUTES(DEVICE)",)
 
 class SccLowLevelLaunchConfiguration(Transformation):
+    """
+    Part of the pipeline for generating Single Column Coalesced
+    Low Level GPU (CUDA Fortran, CUDA C, HIP, ...) for block-indexed gridpoint/single-column
+    routines (responsible for the launch configuration including the chevron notation).
+    """
 
-    def __init__(self, horizontal, vertical, block_dim, transformation_type='parametrise',
-                 derived_types=None, mode="CUF"):
+    def __init__(self, horizontal, vertical, block_dim, transformation_type='parametrise', mode="CUF"):
+        """
+        Part of the pipeline for generating Single Column Coalesced
+        Low Level GPU (CUDA Fortran, CUDA C, HIP, ...) for block-indexed gridpoint/single-column
+        routines responsible for the launch configuration including the chevron notation.
+
+        .. note::
+            In dependence of the transformation type ``transformation_type``, further
+            transformations are necessary:
+
+            * ``transformation_type = 'parametrise'`` requires a subsequent
+              :any:`ParametriseTransformation` transformation with the necessary information
+              to parametrise (at least) the ``vertical`` `size`
+            * ``transformation_type = 'hoist'`` requires subsequent :any:`HoistVariablesAnalysis`
+              and :class:`HoistVariablesTransformation` transformations (e.g.
+              :any:`HoistTemporaryArraysAnalysis` for analysis and
+              :any:`HoistTemporaryArraysTransformationDeviceAllocatable` or
+              :any:`HoistTemporaryArraysPragmaOffloadTransformation` for synthesis)
+
+        Parameters
+        ----------
+        horizontal : :any:`Dimension`
+            :any:`Dimension` object describing the variable conventions used in code
+            to define the horizontal data dimension and iteration space.
+        vertical : :any:`Dimension`
+            :any:`Dimension` object describing the variable conventions used in code
+            to define the vertical dimension, as needed to decide array privatization.
+        block_dim : :any:`Dimension`
+            :any:`Dimension` object to define the blocking dimension
+            to use for hoisted column arrays if hoisting is enabled.
+        transformation_type : str
+            Kind of transformation/Handling of temporaries/local arrays
+
+            - `parametrise`: parametrising the array dimensions to make the vertical dimension
+              a compile-time constant
+            - `hoist`: host side hoisting of (relevant) arrays
+        mode: str
+            Mode/language to target
+
+            - `CUF` - CUDA Fortran
+            - `CUDA` - CUDA C
+            - `HIP` - HIP
+        """
         self.horizontal = horizontal
         self.vertical = vertical
         self.block_dim = block_dim
@@ -143,12 +188,6 @@ def __init__(self, horizontal, vertical, block_dim, transformation_type='paramet
         self.transformation_description = {'parametrise': 'parametrised array dimensions of local arrays',
                                            'hoist': 'host side hoisted local arrays'}
 
-        if derived_types is None:
-            self.derived_types = ()
-        else:
-            self.derived_types = [_.upper() for _ in derived_types]
-        self.derived_type_variables = ()
-
     def transform_subroutine(self, routine, **kwargs):
 
         item = kwargs.get('item', None)
@@ -464,9 +503,58 @@ def driver_launch_configuration(self, routine, block_dim, targets=None):
 
 
 class SccLowLevelDataOffload(Transformation):
+    """
+    Part of the pipeline for generating Single Column Coalesced
+    Low Level GPU (CUDA Fortran, CUDA C, HIP, ...) for block-indexed gridpoint/single-column
+    routines (responsible for the data offload).
+    """
 
     def __init__(self, horizontal, vertical, block_dim, transformation_type='parametrise',
                  derived_types=None, mode="CUF"):
+        """
+        Part of the pipeline for generating Single Column Coalesced
+        Low Level GPU (CUDA Fortran, CUDA C, HIP, ...) for block-indexed gridpoint/single-column
+        routines responsible for the data offload..
+
+        .. note::
+            In dependence of the transformation type ``transformation_type``, further
+            transformations are necessary:
+
+            * ``transformation_type = 'parametrise'`` requires a subsequent
+              :any:`ParametriseTransformation` transformation with the necessary information
+              to parametrise (at least) the ``vertical`` `size`
+            * ``transformation_type = 'hoist'`` requires subsequent :any:`HoistVariablesAnalysis`
+              and :class:`HoistVariablesTransformation` transformations (e.g.
+              :any:`HoistTemporaryArraysAnalysis` for analysis and
+              :any:`HoistTemporaryArraysTransformationDeviceAllocatable` or
+              :any:`HoistTemporaryArraysPragmaOffloadTransformation` for synthesis)
+
+        Parameters
+        ----------
+        horizontal : :any:`Dimension`
+            :any:`Dimension` object describing the variable conventions used in code
+            to define the horizontal data dimension and iteration space.
+        vertical : :any:`Dimension`
+            :any:`Dimension` object describing the variable conventions used in code
+            to define the vertical dimension, as needed to decide array privatization.
+        block_dim : :any:`Dimension`
+            :any:`Dimension` object to define the blocking dimension
+            to use for hoisted column arrays if hoisting is enabled.
+        derived_types: tuple
+            Derived types that are relevant
+        transformation_type : str
+            Kind of transformation/Handling of temporaries/local arrays
+
+            - `parametrise`: parametrising the array dimensions to make the vertical dimension
+              a compile-time constant
+            - `hoist`: host side hoisting of (relevant) arrays
+        mode: str
+            Mode/language to target
+
+            - `CUF` - CUDA Fortran
+            - `CUDA` - CUDA C
+            - `HIP` - HIP
+        """
         self.horizontal = horizontal
         self.vertical = vertical
         self.block_dim = block_dim
@@ -709,8 +797,6 @@ def driver_device_variables(self, routine, targets=None):
                         pragma_map[pragma] = as_tuple(copy_end_pragmas)
                 if pragma_map:
                     routine.body = Transformer(pragma_map).visit(routine.body)
-
-            # return
         else:
             # Declaration
             routine.spec.append(ir.Comment(''))
diff --git a/loki/transformations/single_column/scc_low_level.py b/loki/transformations/single_column/scc_low_level.py
index e035ce6d6..0fbdb66c3 100644
--- a/loki/transformations/single_column/scc_low_level.py
+++ b/loki/transformations/single_column/scc_low_level.py
@@ -48,7 +48,6 @@ def inline_elemental_kernel(routine, **kwargs):
 class InlineTransformation(Transformation):
 
     def transform_subroutine(self, routine, **kwargs):
-        # inline_elemental_kernel(routine, **kwargs)
         role = kwargs['role']
 
         if role == 'kernel':
@@ -56,6 +55,72 @@ def transform_subroutine(self, routine, **kwargs):
             inline_constant_parameters(routine, external_only=True)
             inline_elemental_functions(routine)
 
+
+"""
+The basic Single Column Coalesced low-level GPU via CUDA-Fortran (SCC-CUF).
+
+This tranformation will convert kernels with innermost vectorisation
+along a common horizontal dimension to a GPU-friendly loop-layout via
+loop inversion and local array variable demotion. The resulting kernel
+remains "vector-parallel", but with the ``horizontal`` loop as the
+outermost iteration dimension (as far as data dependencies
+allow). This allows local temporary arrays to be demoted to scalars,
+where possible.
+
+Kernels are specified via ``'GLOBAL'`` and the number of threads that
+execute the kernel for a given call is specified via the chevron syntax.
+
+This :any:`Pipeline` applies the following :any:`Transformation`
+classes in sequence:
+1. :any:`SCCBaseTransformation` - Ensure utility variables and resolve
+   problematic code constructs.
+2. :any:`SCCDevectorTransformation` - Remove horizontal vector loops.
+3. :any:`SCCDemoteTransformation` - Demote local temporary array
+   variables where appropriate.
+4. :any:`SCCRevectorTransformation` - Re-insert the vecotr loops outermost,
+   according to identified vector sections.
+5. :any:`LowerBlockIndexTransformation` - Lower the block index (for
+   array argument definitions).
+6. :any:`InjectBlockIndexTransformation` - Complete the previous step
+   and inject the block index for the relevant arrays.
+7. :any:`LowerBlockLoopTransformation` - Lower the block loop
+   from driver to kernel(s).
+8. :any:`SCCLowLevelLaunchConfiguration` - Create launch configuration
+   and related things.
+9. :any:`SCCLowLevelDataOffload` - Create/handle data offload
+   and related things.
+
+Parameters
+----------
+horizontal : :any:`Dimension`
+    :any:`Dimension` object describing the variable conventions used in code
+    to define the horizontal data dimension and iteration space.
+block_dim : :any:`Dimension`
+    Optional ``Dimension`` object to define the blocking dimension
+    to use for hoisted column arrays if hoisting is enabled.
+directive : string or None
+    Directives flavour to use for parallelism annotations; either
+    ``'openacc'`` or ``None``.
+trim_vector_sections : bool
+    Flag to trigger trimming of extracted vector sections to remove
+    nodes that are not assignments involving vector parallel arrays.
+demote_local_arrays : bool
+    Flag to trigger local array demotion to scalar variables where possible
+derived_types: tuple
+    List of relevant derived types
+transformation_type : str
+    Kind of transformation/Handling of temporaries/local arrays
+
+    - `parametrise`: parametrising the array dimensions to make the vertical dimension
+      a compile-time constant
+    - `hoist`: host side hoisting of (relevant) arrays
+mode: str
+    Mode/language to target
+
+    - `CUF` - CUDA Fortran
+    - `CUDA` - CUDA C
+    - `HIP` - HIP
+"""
 SCCLowLevelCuf = partial(
     Pipeline, classes=(
         SCCBaseTransformation,
@@ -70,6 +135,50 @@ def transform_subroutine(self, routine, **kwargs):
     )
 )
 
+"""
+The Single Column Coalesced low-level GPU via CUDA-Fortran (SCC-CUF)
+handling temporaries via parametrisation.
+
+For details of the kernel and driver-side transformations, please
+refer to :any:`SCCLowLevelCuf`.
+
+In addition, this pipeline will invoke
+:any:`ParametriseTransformation` to parametrise relevant array
+dimensions to allow having temporary arrays.
+
+Parameters
+----------
+horizontal : :any:`Dimension`
+    :any:`Dimension` object describing the variable conventions used in code
+    to define the horizontal data dimension and iteration space.
+block_dim : :any:`Dimension`
+    Optional ``Dimension`` object to define the blocking dimension
+    to use for hoisted column arrays if hoisting is enabled.
+directive : string or None
+    Directives flavour to use for parallelism annotations; either
+    ``'openacc'`` or ``None``.
+trim_vector_sections : bool
+    Flag to trigger trimming of extracted vector sections to remove
+    nodes that are not assignments involving vector parallel arrays.
+demote_local_arrays : bool
+    Flag to trigger local array demotion to scalar variables where possible
+derived_types: tuple
+    List of relevant derived types
+transformation_type : str
+    Kind of transformation/Handling of temporaries/local arrays
+
+    - `parametrise`: parametrising the array dimensions to make the vertical dimension
+      a compile-time constant
+    - `hoist`: host side hoisting of (relevant) arrays
+mode: str
+    Mode/language to target
+
+    - `CUF` - CUDA Fortran
+    - `CUDA` - CUDA C
+    - `HIP` - HIP
+dic2p: dict
+    Dictionary of variable names and corresponding values to be parametrised.
+"""
 SCCLowLevelCufParametrise = partial(
     Pipeline, classes=(
         SCCBaseTransformation,
@@ -85,6 +194,49 @@ def transform_subroutine(self, routine, **kwargs):
     )
 )
 
+"""
+The Single Column Coalesced low-level GPU via CUDA-Fortran (SCC-CUF)
+handling temporaries via hoisting.
+
+For details of the kernel and driver-side transformations, please
+refer to :any:`SCCLowLevelCuf`.
+
+In addition, this pipeline will invoke
+:any:`HoistTemporaryArraysAnalysis` and
+:any:`HoistTemporaryArraysDeviceAllocatableTransformation`
+to hoist temporary arrays.
+
+Parameters
+----------
+horizontal : :any:`Dimension`
+    :any:`Dimension` object describing the variable conventions used in code
+    to define the horizontal data dimension and iteration space.
+block_dim : :any:`Dimension`
+    Optional ``Dimension`` object to define the blocking dimension
+    to use for hoisted column arrays if hoisting is enabled.
+directive : string or None
+    Directives flavour to use for parallelism annotations; either
+    ``'openacc'`` or ``None``.
+trim_vector_sections : bool
+    Flag to trigger trimming of extracted vector sections to remove
+    nodes that are not assignments involving vector parallel arrays.
+demote_local_arrays : bool
+    Flag to trigger local array demotion to scalar variables where possible
+derived_types: tuple
+    List of relevant derived types
+transformation_type : str
+    Kind of transformation/Handling of temporaries/local arrays
+
+    - `parametrise`: parametrising the array dimensions to make the vertical dimension
+      a compile-time constant
+    - `hoist`: host side hoisting of (relevant) arrays
+mode: str
+    Mode/language to target
+
+    - `CUF` - CUDA Fortran
+    - `CUDA` - CUDA C
+    - `HIP` - HIP
+"""
 SCCLowLevelCufHoist = partial(
     Pipeline, classes=(
         SCCBaseTransformation,
@@ -101,6 +253,83 @@ def transform_subroutine(self, routine, **kwargs):
     )
 )
 
+"""
+The Single Column Coalesced low-level GPU via low-level C-style
+kernel language (CUDA, HIP, ...) handling temporaries via parametrisation.
+
+This tranformation will convert kernels with innermost vectorisation
+along a common horizontal dimension to a GPU-friendly loop-layout via
+loop inversion and local array variable demotion. The resulting kernel
+remains "vector-parallel", but with the ``horizontal`` loop as the
+outermost iteration dimension (as far as data dependencies
+allow). This allows local temporary arrays to be demoted to scalars,
+where possible.
+
+Kernels are specified via e.g., ``'__global__'`` and the number of threads that
+execute the kernel for a given call is specified via the chevron syntax.
+
+This :any:`Pipeline` applies the following :any:`Transformation`
+classes in sequence:
+1. :any:`InlineTransformation` - Inline constants and elemental
+   functions.
+2. :any:`GlobalVariableAnalysis` - Analysis of global variables
+3. :any:`GlobalVarHoistTransformation` - Hoist global variables
+   to the driver.
+4. :any:`DerivedTypeArgumentsTransformation` - Flatten derived types/
+   remove derived types from procedure signatures by replacing the
+   (relevant) derived type arguments by its member variables.
+5. :any:`SCCBaseTransformation` - Ensure utility variables and resolve
+   problematic code constructs.
+6. :any:`SCCDevectorTransformation` - Remove horizontal vector loops.
+7. :any:`SCCDemoteTransformation` - Demote local temporary array
+   variables where appropriate.
+8. :any:`SCCRevectorTransformation` - Re-insert the vecotr loops outermost,
+   according to identified vector sections.
+9. :any:`LowerBlockIndexTransformation` - Lower the block index (for
+   array argument definitions).
+10. :any:`InjectBlockIndexTransformation` - Complete the previous step
+   and inject the block index for the relevant arrays.
+11. :any:`LowerBlockLoopTransformation` - Lower the block loop
+   from driver to kernel(s).
+12. :any:`SCCLowLevelLaunchConfiguration` - Create launch configuration
+   and related things.
+13. :any:`SCCLowLevelDataOffload` - Create/handle data offload
+   and related things.
+14. :any:`ParametriseTransformation` - Parametrise according to ``dic2p``.
+
+Parameters
+----------
+horizontal : :any:`Dimension`
+    :any:`Dimension` object describing the variable conventions used in code
+    to define the horizontal data dimension and iteration space.
+block_dim : :any:`Dimension`
+    Optional ``Dimension`` object to define the blocking dimension
+    to use for hoisted column arrays if hoisting is enabled.
+directive : string or None
+    Directives flavour to use for parallelism annotations; either
+    ``'openacc'`` or ``None``.
+trim_vector_sections : bool
+    Flag to trigger trimming of extracted vector sections to remove
+    nodes that are not assignments involving vector parallel arrays.
+demote_local_arrays : bool
+    Flag to trigger local array demotion to scalar variables where possible
+derived_types: tuple
+    List of relevant derived types
+transformation_type : str
+    Kind of transformation/Handling of temporaries/local arrays
+
+    - `parametrise`: parametrising the array dimensions to make the vertical dimension
+      a compile-time constant
+    - `hoist`: host side hoisting of (relevant) arrays
+mode: str
+    Mode/language to target
+
+    - `CUF` - CUDA Fortran
+    - `CUDA` - CUDA C
+    - `HIP` - HIP
+dic2p: dict
+    Dictionary of variable names and corresponding values to be parametrised.
+"""
 SCCLowLevelParametrise = partial(
     Pipeline, classes=(
         InlineTransformation,
@@ -120,6 +349,83 @@ def transform_subroutine(self, routine, **kwargs):
     )
 )
 
+"""
+The Single Column Coalesced low-level GPU via low-level C-style
+kernel language (CUDA, HIP, ...) handling temporaries via parametrisation.
+
+This tranformation will convert kernels with innermost vectorisation
+along a common horizontal dimension to a GPU-friendly loop-layout via
+loop inversion and local array variable demotion. The resulting kernel
+remains "vector-parallel", but with the ``horizontal`` loop as the
+outermost iteration dimension (as far as data dependencies
+allow). This allows local temporary arrays to be demoted to scalars,
+where possible.
+
+Kernels are specified via e.g., ``'__global__'`` and the number of threads that
+execute the kernel for a given call is specified via the chevron syntax.
+
+This :any:`Pipeline` applies the following :any:`Transformation`
+classes in sequence:
+1. :any:`InlineTransformation` - Inline constants and elemental
+   functions.
+2. :any:`GlobalVariableAnalysis` - Analysis of global variables
+3. :any:`GlobalVarHoistTransformation` - Hoist global variables
+   to the driver.
+4. :any:`DerivedTypeArgumentsTransformation` - Flatten derived types/
+   remove derived types from procedure signatures by replacing the
+   (relevant) derived type arguments by its member variables.
+5. :any:`SCCBaseTransformation` - Ensure utility variables and resolve
+   problematic code constructs.
+6. :any:`SCCDevectorTransformation` - Remove horizontal vector loops.
+7. :any:`SCCDemoteTransformation` - Demote local temporary array
+   variables where appropriate.
+8. :any:`SCCRevectorTransformation` - Re-insert the vecotr loops outermost,
+   according to identified vector sections.
+9. :any:`LowerBlockIndexTransformation` - Lower the block index (for
+   array argument definitions).
+10. :any:`InjectBlockIndexTransformation` - Complete the previous step
+   and inject the block index for the relevant arrays.
+11. :any:`LowerBlockLoopTransformation` - Lower the block loop
+   from driver to kernel(s).
+12. :any:`SCCLowLevelLaunchConfiguration` - Create launch configuration
+   and related things.
+13. :any:`SCCLowLevelDataOffload` - Create/handle data offload
+   and related things.
+14. :any:`HoistTemporaryArraysAnalysis` - Analysis part of hoisting.
+15. :any:`HoistTemporaryArraysPragmaOffloadTransformation` - Syntesis
+    part of hoisting.
+
+Parameters
+----------
+horizontal : :any:`Dimension`
+    :any:`Dimension` object describing the variable conventions used in code
+    to define the horizontal data dimension and iteration space.
+block_dim : :any:`Dimension`
+    Optional ``Dimension`` object to define the blocking dimension
+    to use for hoisted column arrays if hoisting is enabled.
+directive : string or None
+    Directives flavour to use for parallelism annotations; either
+    ``'openacc'`` or ``None``.
+trim_vector_sections : bool
+    Flag to trigger trimming of extracted vector sections to remove
+    nodes that are not assignments involving vector parallel arrays.
+demote_local_arrays : bool
+    Flag to trigger local array demotion to scalar variables where possible
+derived_types: tuple
+    List of relevant derived types
+transformation_type : str
+    Kind of transformation/Handling of temporaries/local arrays
+
+    - `parametrise`: parametrising the array dimensions to make the vertical dimension
+      a compile-time constant
+    - `hoist`: host side hoisting of (relevant) arrays
+mode: str
+    Mode/language to target
+
+    - `CUF` - CUDA Fortran
+    - `CUDA` - CUDA C
+    - `HIP` - HIP
+"""
 SCCLowLevelHoist = partial(
     Pipeline, classes=(
         InlineTransformation,
@@ -135,7 +441,7 @@ def transform_subroutine(self, routine, **kwargs):
         LowerBlockLoopTransformation,
         SccLowLevelLaunchConfiguration,
         SccLowLevelDataOffload,
-        HoistTemporaryArraysAnalysis,
+      HoistTemporaryArraysAnalysis,
         HoistTemporaryArraysPragmaOffloadTransformation
     )
 )
diff --git a/loki/transformations/tests/test_block_index_inject.py b/loki/transformations/tests/test_block_index_inject.py
index 35d6d41b7..0c8f9dd49 100644
--- a/loki/transformations/tests/test_block_index_inject.py
+++ b/loki/transformations/tests/test_block_index_inject.py
@@ -390,8 +390,6 @@ def test_blockview_to_fieldview_exception(frontend, horizontal):
                                            targets=('compute',))
 
 
-# @pytest.mark.parametrize('frontend', available_frontends(xfail=[(OMNI,
-#                          'OMNI correctly complains about rank mismatch in assignment.')]))
 @pytest.mark.parametrize('frontend', available_frontends())
 @pytest.mark.parametrize('block_dim_arg', (False, True))
 @pytest.mark.parametrize('recurse_to_kernels', (False, True))
@@ -538,18 +536,7 @@ def test_simple_lower_loop(blocking, frontend, block_dim_arg, recurse_to_kernels
     assert blocking.index not in kernel_mod['kernel'].arguments
     assert blocking.index in kernel_mod['kernel'].variable_map
 
-    # print(f"---------------\ndriver:\n{fgen(driver)}")
-    # print(f"---------------\nkernel:\n{fgen(kernel_mod['kernel'])}")
-    # print(f"---------------\nkernel:\n{fgen(nested_kernel_mod['compute'])}")
-    # print("\n\n")
-    # print(f"kernel.symbol_table: {dict(kernel['kernel'].symbol_attrs)}")
-    # assigns = FindNodes(Assignment).visit(kernel.body)
-    # assert assigns[0].lhs == 'var(:,:,ibl)'
-    # calls = FindNodes(CallStatement).visit(kernel.body)
-    # assert 'var(:,:,ibl)' in calls[0].arguments
-
-# @pytest.mark.parametrize('frontend', available_frontends(xfail=[(OMNI,
-#                          'OMNI correctly complains about rank mismatch in assignment.')]))
+
 @pytest.mark.parametrize('frontend', available_frontends())
 @pytest.mark.parametrize('recurse_to_kernels', (False, True))
 @pytest.mark.parametrize('targets', (('kernel', 'another_kernel', 'compute'), ('kernel', 'compute')))
@@ -753,9 +740,3 @@ def test_lower_loop(blocking, frontend, recurse_to_kernels, targets):
     assert blocking.size in kernel_mod['kernel'].arguments
     assert blocking.index not in kernel_mod['kernel'].arguments
     assert blocking.index in kernel_mod['kernel'].variable_map
-
-    # print(f"---------------\ndriver:\n{fgen(driver)}")
-    # print(f"---------------\nkernel:\n{fgen(kernel_mod['kernel'])}")
-    # print(f"---------------\nkernel:\n{fgen(another_kernel_mod['another_kernel'])}")
-    # print(f"---------------\nkernel:\n{fgen(nested_kernel_mod['compute'])}")
-    # print("\n\n")
diff --git a/loki/transformations/transpile/fortran_c.py b/loki/transformations/transpile/fortran_c.py
index 210084770..811a442b3 100644
--- a/loki/transformations/transpile/fortran_c.py
+++ b/loki/transformations/transpile/fortran_c.py
@@ -116,15 +116,13 @@ class FortranCTransformation(Transformation):
     # Set of standard module names that have no C equivalent
     __fortran_intrinsic_modules = ['ISO_FORTRAN_ENV', 'ISO_C_BINDING']
 
-    def __init__(self, inline_elementals=True, use_c_ptr=False, path=None, language='c'): # codegen=cgen):
+    def __init__(self, inline_elementals=True, use_c_ptr=False, path=None, language='c'):
         self.inline_elementals = inline_elementals
         self.use_c_ptr = use_c_ptr
         self.path = Path(path) if path is not None else None
-        # self.codegen = codegen
         self.language = language.lower()
         assert self.language in ['c', 'cuda'] # , 'hip']
 
-        # self.langgen = cgen if self.language == 'c' else cppgen
         if self.language == 'c':
             self.codegen = cgen
         elif self.language == 'cuda':
@@ -195,21 +193,14 @@ def transform_subroutine(self, routine, **kwargs):
             contains = Section(body=(Intrinsic('CONTAINS'), wrapper))
             self.wrapperpath = (path/wrapper.name.lower()).with_suffix('.F90')
             module = Module(name=f'{wrapper.name.upper()}_MOD', contains=contains)
-            ### new ###
             module.spec = Section(body=(Import(module='iso_c_binding'),))
 
             # Generate C source file from Loki IR
             c_kernel = self.generate_c_kernel(routine, targets=targets)
             self.c_path = (path/c_kernel.name.lower()).with_suffix('.c')
-            # Sourcefile.to_file(source=self.langgen(c_kernel), path=self.c_path)
-            ### end new ###
             Sourcefile.to_file(source=fgen(module), path=self.wrapperpath)
 
             # Generate C source file from Loki IR
-            # c_kernel = self.generate_c_kernel(routine)
-            # self.c_path = (path/c_kernel.name.lower()).with_suffix('.c')
-            # Sourcefile.to_file(source=self.codegen(c_kernel), path=self.c_path)
-            #### new ####
             # c_kernel.spec.prepend(Import(module=f'{c_kernel.name.lower()}.h', c_import=True))
             for successor in successors:
                 if self.language == 'c':
@@ -219,32 +210,19 @@ def transform_subroutine(self, routine, **kwargs):
                     if not isinstance(successor, ProcedureItem):
                         c_kernel.spec.prepend(Import(module=f'{successor.routine.name.lower()}_c.c', c_import=True))
 
-            # Sourcefile.to_file(source=self.langgen(c_kernel), path=self.c_path)
-
             if depth == 1:
                 if self.language != 'c':
                     c_kernel_launch = c_kernel.clone(name=f"{c_kernel.name}_launch", prefix="extern_c")
                     self.generate_c_kernel_launch(c_kernel_launch, c_kernel)
                     self.c_path = (path/c_kernel_launch.name.lower()).with_suffix('.h')
                     Sourcefile.to_file(source=self.codegen(c_kernel_launch, extern=True), path=self.c_path)
-            else:
-                # TODO: nested device routines ..., should work correctly?
-                # c_kernel_header = c_kernel.clone(name=f"{c_kernel.name}", prefix="header_only device")
-                pass
-                # c_kernel_header = c_kernel.clone(name=f"{routine.name.lower()}", prefix="header_only device")
-                # TODO: this shouldn't be necessary anymore usinge self.codegen(..., header=True)
-                # self.generate_c_kernel_header(c_kernel_header)
-                # self.c_path =(path/c_kernel_header.name.lower()).with_suffix('.h')
-                # Sourcefile.to_file(source=self.codegen(c_kernel_header), path=self.c_path)
-
-            ## new
+
             assignments = FindNodes(Assignment).visit(c_kernel.body)
             assignments2remove = ['griddim', 'blockdim']
             assignment_map = {assignment: None for assignment in assignments
                     if assignment.lhs.name.lower() in assignments2remove}
             # == block_dim.index.lower() or assignment.lhs.name.lower() in [_.lower() for _ in horizontal.bounds]}
             c_kernel.body = Transformer(assignment_map).visit(c_kernel.body)
-            ## end:new
 
             if depth > 1:
                 c_kernel.spec.prepend(Import(module=f'{c_kernel.name.lower()}.h', c_import=True))
@@ -253,7 +231,6 @@ def transform_subroutine(self, routine, **kwargs):
             self.c_path = (path/c_kernel.name.lower()).with_suffix('.h')
             Sourcefile.to_file(source=self.codegen(c_kernel, header=True), path=self.c_path)
             self.c_path = (path/c_kernel.name.lower()).with_suffix('.c')
-            #### end new ####
 
     def c_struct_typedef(self, derived):
         """
@@ -679,13 +656,6 @@ def generate_c_kernel_launch(self, kernel_launch, kernel, **kwargs):
         kernel_call = kernel.clone()
         call_arguments = []
         for arg in kernel_call.arguments:
-            # TODO: ?
-            # if False: # isinstance(arg, Array):
-            #     # _type = arg.type.clone(pointer=False)
-            #     # kernel_call.symbol_attrs[arg.name] = _type
-            #     call_arguments.append(arg.clone(dimensions=None))
-            #     # call_arguments.append(arg.clone(dimensions=None, type=_type))
-            # else:
             call_arguments.append(arg)
 
         griddim = None