diff --git a/loki/batch/tests/test_scheduler.py b/loki/batch/tests/test_scheduler.py
index bd411ca95..b3d42948b 100644
--- a/loki/batch/tests/test_scheduler.py
+++ b/loki/batch/tests/test_scheduler.py
@@ -2887,21 +2887,22 @@ def test_pipeline_config_compose(config):
     assert isinstance(pipeline, Pipeline)
 
     # Check that the pipeline is correctly composed
-    assert len(pipeline.transformations) == 7
+    assert len(pipeline.transformations) == 8
     assert type(pipeline.transformations[0]).__name__ == 'RemoveCodeTransformation'
-    assert type(pipeline.transformations[1]).__name__ == 'SCCBaseTransformation'
-    assert type(pipeline.transformations[2]).__name__ == 'SCCDevectorTransformation'
-    assert type(pipeline.transformations[3]).__name__ == 'SCCDemoteTransformation'
-    assert type(pipeline.transformations[4]).__name__ == 'SCCRevectorTransformation'
-    assert type(pipeline.transformations[5]).__name__ == 'SCCAnnotateTransformation'
-    assert type(pipeline.transformations[6]).__name__ == 'ModuleWrapTransformation'
+    assert type(pipeline.transformations[1]).__name__ == 'SCCFuseVerticalLoops'
+    assert type(pipeline.transformations[2]).__name__ == 'SCCBaseTransformation'
+    assert type(pipeline.transformations[3]).__name__ == 'SCCDevectorTransformation'
+    assert type(pipeline.transformations[4]).__name__ == 'SCCDemoteTransformation'
+    assert type(pipeline.transformations[5]).__name__ == 'SCCRevectorTransformation'
+    assert type(pipeline.transformations[6]).__name__ == 'SCCAnnotateTransformation'
+    assert type(pipeline.transformations[7]).__name__ == 'ModuleWrapTransformation'
 
     # Check for some specified and default constructor flags
     assert pipeline.transformations[0].call_names == ('dr_hook',)
     assert pipeline.transformations[0].remove_imports is False
-    assert isinstance(pipeline.transformations[1].horizontal, Dimension)
-    assert pipeline.transformations[1].horizontal.size == 'KLON'
-    assert pipeline.transformations[1].horizontal.index == 'JL'
-    assert pipeline.transformations[1].directive == 'openacc'
-    assert pipeline.transformations[2].trim_vector_sections is True
-    assert pipeline.transformations[6].replace_ignore_items is True
+    assert isinstance(pipeline.transformations[2].horizontal, Dimension)
+    assert pipeline.transformations[2].horizontal.size == 'KLON'
+    assert pipeline.transformations[2].horizontal.index == 'JL'
+    assert pipeline.transformations[2].directive == 'openacc'
+    assert pipeline.transformations[3].trim_vector_sections is True
+    assert pipeline.transformations[7].replace_ignore_items is True
diff --git a/loki/transformations/single_column/__init__.py b/loki/transformations/single_column/__init__.py
index d5fe934d1..2ed513c63 100644
--- a/loki/transformations/single_column/__init__.py
+++ b/loki/transformations/single_column/__init__.py
@@ -13,3 +13,4 @@
 from loki.transformations.single_column.scc_cuf import * # noqa
 from loki.transformations.single_column.vector import * # noqa
 from loki.transformations.single_column.scc_low_level import * # noqa
+from loki.transformations.single_column.vertical import * # noqa
diff --git a/loki/transformations/single_column/scc.py b/loki/transformations/single_column/scc.py
index 3b7f7d7dc..001ddca14 100644
--- a/loki/transformations/single_column/scc.py
+++ b/loki/transformations/single_column/scc.py
@@ -19,7 +19,7 @@
 from loki.transformations.single_column.vector import (
     SCCDevectorTransformation, SCCDemoteTransformation, SCCRevectorTransformation
 )
-
+from loki.transformations.single_column.vertical import SCCFuseVerticalLoops
 
 __all__ = [
     'SCCVectorPipeline', 'SCCHoistPipeline', 'SCCStackPipeline', 'SCCRawStackPipeline'
@@ -75,6 +75,7 @@
 """
 SCCVectorPipeline = partial(
     Pipeline, classes=(
+        SCCFuseVerticalLoops,
         SCCBaseTransformation,
         SCCDevectorTransformation,
         SCCDemoteTransformation,
@@ -121,6 +122,7 @@
 """
 SCCHoistPipeline = partial(
     Pipeline, classes=(
+        SCCFuseVerticalLoops,
         SCCBaseTransformation,
         SCCDevectorTransformation,
         SCCDemoteTransformation,
@@ -166,6 +168,7 @@
 """
 SCCStackPipeline = partial(
     Pipeline, classes=(
+        SCCFuseVerticalLoops,
         SCCBaseTransformation,
         SCCDevectorTransformation,
         SCCDemoteTransformation,
diff --git a/loki/transformations/single_column/tests/test_scc_vertical.py b/loki/transformations/single_column/tests/test_scc_vertical.py
new file mode 100644
index 000000000..3a476a882
--- /dev/null
+++ b/loki/transformations/single_column/tests/test_scc_vertical.py
@@ -0,0 +1,255 @@
+# (C) Copyright 2018- ECMWF.
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+import pytest
+
+from loki import Subroutine, Dimension
+from loki.frontend import available_frontends
+from loki.ir import FindNodes, Loop, FindVariables
+from loki.transformations.single_column import SCCFuseVerticalLoops
+
+
+@pytest.fixture(scope='module', name='horizontal')
+def fixture_horizontal():  # horizontal Dimension (size 'nlon', index 'jl') used for the expected shapes below
+    return Dimension(
+        name='horizontal', size='nlon', index='jl',
+        bounds=('start', 'end'), aliases=('nproma',)
+    )
+
+@pytest.fixture(scope='module', name='horizontal_bounds_aliases')
+def fixture_horizontal_bounds_aliases():  # NOTE(review): not requested by any test in this module
+    return Dimension(
+        name='horizontal_bounds_aliases', size='nlon', index='jl',
+        bounds=('start', 'end'), aliases=('nproma',),
+        bounds_aliases=('bnds%start', 'bnds%end')
+    )
+
+@pytest.fixture(scope='module', name='vertical')
+def fixture_vertical():  # vertical Dimension (size 'nz', index 'jk') handed to SCCFuseVerticalLoops
+    return Dimension(name='vertical', size='nz', index='jk', aliases=('nlev',))
+
+@pytest.fixture(scope='module', name='blocking')
+def fixture_blocking():  # NOTE(review): not requested by any test in this module
+    return Dimension(name='blocking', size='nb', index='b')
+
+
+@pytest.mark.parametrize('frontend', available_frontends())
+def test_simple_scc_fuse_verticals_transformation(frontend, horizontal, vertical):
+    """
+    Test simple example of vertical loop fusion and demotion of temporaries.
+    """
+
+    fcode_kernel = """
+  SUBROUTINE compute_column(start, end, nlon, nz, q, t)
+    INTEGER, INTENT(IN) :: start, end  ! Iteration indices
+    INTEGER, INTENT(IN) :: nlon, nz    ! Size of the horizontal and vertical
+    REAL, INTENT(INOUT) :: t(nlon,nz)
+    REAL, INTENT(INOUT) :: q(nlon,nz)
+    REAL :: temp_t(nlon, nz)
+    REAL :: temp_q(nlon, nz)
+    INTEGER :: jl, JK
+    REAL :: c
+
+    c = 5.345
+    !$loki loop-fusion group(1)
+    DO JK = 1, nz
+      DO jl = start, end
+        temp_t(jl, jk) = c
+        temp_q(jl, JK) = c
+      END DO
+    END DO
+
+    !$loki loop-fusion group(1)
+    DO jk = 2, nz
+      DO jl = start, end
+        t(jl, jk) = temp_t(jl, jk) * jk
+        q(jl, jk) = q(jl, jk-1) + t(jl, jk) * temp_q(jl, jk)
+      END DO
+    END DO
+
+    ! The scaling is purposefully upper-cased
+    DO JL = START, END
+      Q(JL, NZ) = Q(JL, NZ) * C
+    END DO
+  END SUBROUTINE compute_column
+"""
+    kernel = Subroutine.from_source(fcode_kernel, frontend=frontend)
+
+    # Ensure we have five loops (two vertical, three horizontal) in the kernel prior to transformation
+    kernel_loops = FindNodes(Loop).visit(kernel.body)
+    assert len(kernel_loops) == 5
+
+    # no-op as 'compute_column' is not within apply_to
+    SCCFuseVerticalLoops(vertical=vertical, apply_to=('another_kernel',)).apply(kernel, role='kernel')
+    # Ensure the kernel still has all five loops after the no-op application
+    kernel_loops = FindNodes(Loop).visit(kernel.body)
+    assert len(kernel_loops) == 5
+
+    # actual loop fusion and demotion ... (as apply_to is not provided and therefore all routines are dispatched)
+    SCCFuseVerticalLoops(vertical=vertical).apply(kernel, role='kernel')
+
+    # Ensure the two vertical loops are fused
+    kernel_loops = FindNodes(Loop).visit(kernel.body)
+    assert len(kernel_loops) == 4
+    assert kernel_loops[0].variable.name.lower() == 'jk'
+    assert kernel_loops[-1].variable.name.lower() == 'jl'
+    assert len([loop for loop in kernel_loops if loop.variable.name.lower() == 'jk']) == 1
+    kernel_var_map = kernel.variable_map
+    assert kernel_var_map['temp_t'].shape == (horizontal.size,)
+    assert kernel_var_map['temp_q'].shape == (horizontal.size,)
+    kernel_vars = [var for var in FindVariables().visit(kernel.body) if var.name.lower() in ['temp_t', 'temp_q']]
+    for var in kernel_vars:
+        assert var.shape == (horizontal.size,)
+        assert var.dimensions == (horizontal.index,)
+
+
+@pytest.mark.parametrize('frontend', available_frontends())
+@pytest.mark.parametrize('ignore', (False, True))
+def test_scc_fuse_verticals_transformation(frontend, horizontal, vertical, ignore):
+    """
+    Test somewhat more sophisticated example of vertical loop fusion
+    and demotion of temporaries.
+    """
+
+    fcode_kernel = f"""
+  SUBROUTINE compute_column(start, end, nlon, nz, q, t)
+    INTEGER, INTENT(IN) :: start, end  ! Iteration indices
+    INTEGER, INTENT(IN) :: nlon, nz    ! Size of the horizontal and vertical
+    REAL, INTENT(INOUT) :: t(nlon,nz)
+    REAL, INTENT(INOUT) :: q(nlon,nz)
+    REAL :: temp_t(nlon, nz)
+    REAL :: temp_t2(nlon, nz)
+    REAL :: temp_q(nlon, nz)
+    REAL :: temp_q2(nlon, nz)
+    REAL :: temp_cld(nlon, nz, 5)
+    INTEGER :: jl, jk, jm
+    REAL :: c
+
+    {'!$loki k-caching ignore(temp_q2)' if ignore else ''}
+
+    c = 5.345
+    !$loki loop-fusion group(1-init)
+    DO jk = 1, nz
+      DO jl = start, end
+        temp_t(jl, jk) = c
+        temp_q(jl, jk) = c
+        temp_t2(jl, jk) = 2*c
+      END DO
+    END DO
+
+    !$loki loop-fusion group(1)
+    !$loki loop-interchange
+    DO jm=1,5
+      DO jk = 1, nz
+        DO jl = start, end
+          temp_cld(jl, jk, jm) = 3.1415
+        END DO
+      END DO
+    END DO
+
+    DO jl = start, end
+      q(jl, jk) = 0.
+    END DO
+
+    !$loki loop-fusion group(1) insert
+    DO jk = 2, nz
+      DO jl = start, end
+        t(jl, jk) = temp_t(jl, jk) * temp_t2(jl, jk-1) * temp_cld(jl, jk, 1)
+        q(jl, jk) = q(jl, jk-1) + t(jl, jk) * temp_q(jl, jk)
+      END DO
+    END DO
+
+    CALL nested_kernel(start, end, nlon, nz, q)
+
+    !$loki loop-fusion group(2)
+    DO jk = 2, nz
+      DO jl = start, end
+        temp_q2(jl, jk) = 3.1415
+      END DO
+    END DO
+
+    !$loki loop-fusion group(2)
+    DO jk = 2, nz
+      DO jl = start, end
+        t(jl, jk) = t(jl, jk) + 3.1415
+        q(jl, jk) = q(jl, jk-1) + t(jl, jk) * temp_q(jl, jk) + temp_q2(jl, jk)
+      END DO
+    END DO
+
+    ! The scaling is purposefully upper-cased
+    DO JL = START, END
+      Q(JL, NZ) = Q(JL, NZ) * C
+    END DO
+  END SUBROUTINE compute_column
+"""
+
+
+    kernel = Subroutine.from_source(fcode_kernel, frontend=frontend)
+
+    # Ensure we have thirteen loops in the kernel prior to transformation
+    kernel_loops = FindNodes(Loop).visit(kernel.body)
+    assert len(kernel_loops) == 13
+    SCCFuseVerticalLoops(vertical=vertical).apply(kernel, role='kernel')
+
+    # Ensure the vertical loops are fused: init loop, fused group 1 and fused group 2 remain
+    kernel_loops = FindNodes(Loop).visit(kernel.body)
+    assert len(kernel_loops) == 12
+    vertical_loops = [loop for loop in kernel_loops if loop.variable.name.lower() == vertical.index]
+    assert len(vertical_loops) == 3
+
+    shape1D = (horizontal.size,)
+    shape2D = (horizontal.size, vertical.size)
+    dimension1D = (horizontal.index,)
+    dimension2D = (horizontal.index,vertical.index)
+    dimension2DI1 = (horizontal.index, f'{vertical.index}-1')
+
+    vertical_loop_0_vars = FindVariables().visit(vertical_loops[0].body)
+    vertical_loop_0_var_names = [var.name.lower() for var in vertical_loop_0_vars]
+    vertical_loop_0_var_dict = dict(zip(vertical_loop_0_var_names, vertical_loop_0_vars))
+    assert 'temp_t2' in vertical_loop_0_var_names
+    assert 'temp_t' not in vertical_loop_0_var_names
+    assert 'temp_q' not in vertical_loop_0_var_names
+    assert 'temp_q2' not in vertical_loop_0_var_names
+    assert 'temp_cld' not in vertical_loop_0_var_names
+    assert vertical_loop_0_var_dict['temp_t2'].shape == shape2D
+    assert vertical_loop_0_var_dict['temp_t2'].dimensions == dimension2D
+
+    vertical_loop_1_vars = FindVariables().visit(vertical_loops[1].body)
+    vertical_loop_1_var_names = [var.name.lower() for var in vertical_loop_1_vars]
+    vertical_loop_1_var_dict = dict(zip(vertical_loop_1_var_names, vertical_loop_1_vars))
+    assert 'temp_t2' in vertical_loop_1_var_names
+    assert 'temp_t' in vertical_loop_1_var_names
+    assert 'temp_q' in vertical_loop_1_var_names
+    assert 'temp_q2' not in vertical_loop_1_var_names
+    assert 'temp_cld' in vertical_loop_1_var_names
+    assert vertical_loop_1_var_dict['temp_t2'].shape == shape2D
+    assert vertical_loop_1_var_dict['temp_t2'].dimensions == dimension2DI1
+    assert vertical_loop_1_var_dict['temp_t'].shape == shape1D
+    assert vertical_loop_1_var_dict['temp_t'].dimensions == dimension1D
+    assert vertical_loop_1_var_dict['temp_q'].shape == shape2D
+    assert vertical_loop_1_var_dict['temp_q'].dimensions == dimension2D
+    assert vertical_loop_1_var_dict['temp_cld'].shape == shape1D + (5,)
+    assert vertical_loop_1_var_dict['temp_cld'].dimensions in (dimension1D + (1,), dimension1D + ('jm',))
+
+    vertical_loop_2_vars = FindVariables().visit(vertical_loops[2].body)
+    vertical_loop_2_var_names = [var.name.lower() for var in vertical_loop_2_vars]
+    vertical_loop_2_var_dict = dict(zip(vertical_loop_2_var_names, vertical_loop_2_vars))
+    assert 'temp_t2' not in vertical_loop_2_var_names
+    assert 'temp_t' not in vertical_loop_2_var_names
+    assert 'temp_q' in vertical_loop_2_var_names
+    assert 'temp_q2' in vertical_loop_2_var_names
+    assert 'temp_cld' not in vertical_loop_2_var_names
+    assert vertical_loop_2_var_dict['temp_q'].shape == shape2D
+    assert vertical_loop_2_var_dict['temp_q'].dimensions == dimension2D
+    assert vertical_loop_2_var_dict['temp_q2'].shape == (shape2D if ignore else shape1D)
+    assert vertical_loop_2_var_dict['temp_q2'].dimensions == (dimension2D if ignore else dimension1D)
+
+    kernel_var_map = kernel.variable_map
+    assert kernel_var_map['temp_t'].shape == shape1D
+    assert kernel_var_map['temp_t2'].shape == shape2D
+    assert kernel_var_map['temp_q'].shape == shape2D
+    assert kernel_var_map['temp_q2'].shape == (shape2D if ignore else shape1D)
diff --git a/loki/transformations/single_column/vertical.py b/loki/transformations/single_column/vertical.py
new file mode 100644
index 000000000..d19f11818
--- /dev/null
+++ b/loki/transformations/single_column/vertical.py
@@ -0,0 +1,224 @@
+# (C) Copyright 2018- ECMWF.
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+from loki.batch import Transformation
+from loki.expression import (
+    symbols as sym
+)
+from loki.ir import (
+    nodes as ir, FindNodes, Transformer,
+    is_loki_pragma, pragmas_attached,
+    get_pragma_parameters, FindVariables
+)
+from loki.tools import as_tuple, CaseInsensitiveDict
+from loki.transformations.transform_loop import loop_fusion, loop_interchange
+from loki.transformations.array_indexing import demote_variables
+from loki.transformations.utilities import get_local_arrays
+from loki.logging import info
+
+__all__ = ['SCCFuseVerticalLoops']
+
+class SCCFuseVerticalLoops(Transformation):
+    """
+    A transformation to fuse vertical loops and demote temporaries in the vertical
+    dimension if possible.
+
+    .. note::
+        This transformation currently relies on pragmas being inserted in the input
+        source files. Relevant pragmas are `!$loki loop-interchange` to expose the
+        vertical loops (in case vertical loops are nested) and `!$loki loop-fusion`
+        possibly grouped via `group(<group name>)`. Further, if there are loops
+        that initialize multilevel arrays (`jk +/- 1`) it is possible to mark those
+        loops as `!$loki loop-fusion group(<group-name>-init)`. This allows to split
+        the relevant node and moves the initialization of those arrays to the top of
+        the group.
+
+    Parameters
+    ----------
+    vertical : :any:`Dimension`
+        :any:`Dimension` object describing the variable conventions used in code
+        to define the vertical data dimension and iteration space.
+    apply_to : list of str, optional
+        list of routines to apply this transformation to (matched case-insensitively),
+        if not provided or None apply to all routines (default: None)
+    """
+
+    def __init__(self, vertical=None, apply_to=None):
+        self.vertical = vertical
+        self.apply_to = tuple(str(name).lower() for name in as_tuple(apply_to))
+
+    def transform_subroutine(self, routine, **kwargs):
+        """
+        Fuse vertical loops and demote temporaries in the vertical dimension
+        if possible.
+
+        Parameters
+        ----------
+        routine : :any:`Subroutine`
+            The subroutine in which the vertical loops should be fused and
+            temporaries be demoted.
+        """
+        if self.vertical is None:
+            info('[SCCFuseVerticalLoops] is not applied as the vertical dimension is not defined!')
+            return
+        role = kwargs['role']
+        if role == 'kernel':
+            if self.apply_to and routine.name.lower() not in self.apply_to:
+                return
+            self.process_kernel(routine)
+
+    def process_kernel(self, routine):
+        """
+        Current logic (simplified):
+
+        1. loop interchange to expose vertical loops
+        2. fuse vertical loops (possibly into multiple groups)
+        3. find local arrays to be demoted and apply heuristics to check whether this is safe
+        4. demote those arrays which are safe to be demoted
+        """
+        # find local arrays with a vertical dimension
+        relevant_local_arrays = self.find_relevant_local_arrays(routine)
+        # find "multilevel" thus "jk +/- 1" arrays
+        multilevel_relevant_local_arrays = self.identify_multilevel_arrays(relevant_local_arrays)
+        # loop interchange to expose vertical loops as outermost loops
+        loop_interchange(routine)
+        # handle initialization of arrays "jk +/- 1" arrays
+        multilevel_relevant_local_arrays_names = set(arr.name.lower() for arr in multilevel_relevant_local_arrays)
+        self.correct_init_of_multilevel_arrays(routine, multilevel_relevant_local_arrays_names)
+        # fuse vertical loops
+        loop_fusion(routine)
+        # demote in vertical dimension if possible
+        relevant_local_arrays_names = set(arr.name.lower() for arr in relevant_local_arrays)
+        demote_candidates = relevant_local_arrays_names - multilevel_relevant_local_arrays_names
+        # check which variables are safe to demote in the vertical
+        safe_to_demote = self.check_safe_to_demote(routine, demote_candidates)
+        # demote locals in vertical dimension
+        dimensions_to_demote = self.vertical.size_expressions + (f"{self.vertical.size}+1",)
+        demote_variables(routine, safe_to_demote, dimensions_to_demote)
+
+    def check_safe_to_demote(self, routine, demote_candidates):
+        """
+        Check whether variables that are candidates to be demoted in the vertical dimension are really
+        safe to be demoted.
+
+        Current heuristic: If the candidate is used in more than one vertical loop, assume it is NOT safe
+        to demote!
+        """
+        loop_var_map = CaseInsensitiveDict()
+        with pragmas_attached(routine, ir.Loop):
+            for loop in FindNodes(ir.Loop).visit(routine.body):
+                if loop.variable == self.vertical.index:
+                    if is_loki_pragma(loop.pragma, starts_with='fused-loop'):
+                        parameters = get_pragma_parameters(loop.pragma, starts_with='fused-loop')
+                        group = parameters.get('group', 'default')
+                        if group == 'ignore':
+                            continue
+                        for var in FindVariables().visit(loop.body):
+                            if isinstance(var, sym.Array):
+                                loop_var_map.setdefault(var.name, set()).add(group)
+
+        safe_to_demote = ()
+        for var in demote_candidates:
+            if var in loop_var_map and len(loop_var_map[var]) <= 1:
+                safe_to_demote += (var,)
+
+        return safe_to_demote
+
+    def find_relevant_local_arrays(self, routine):
+        """
+        Find local arrays/temporaries that do have the vertical dimension.
+        """
+        # local/temporary arrays
+        local_arrays = get_local_arrays(routine, routine.body)
+        # only those with the vertical size within shape
+        relevant_local_arrays = [arr for arr in local_arrays if self.vertical.size.lower()
+                in FindVariables().visit(arr.shape)]
+        # filter arrays to be ignored (for whatever reason)
+        ignore_names = self.find_local_arrays_to_be_ignored(routine)
+        if ignore_names:
+            relevant_local_arrays = [arr for arr in relevant_local_arrays if arr.name.lower() not in ignore_names]
+        return relevant_local_arrays
+
+    def find_local_arrays_to_be_ignored(self, routine):
+        """
+        Identify variables to be ignored regarding demotion for whatever reason.
+
+        Reasons are:
+
+        * explicitly marked to be ignored via pragmas within the input source file, e.g.,
+          'loki k-caching ignore(var1, var2, ...)'
+        """
+        ignore = ()
+        pragmas = FindNodes(ir.Pragma).visit(routine.body)
+        # look for 'loki k-caching ignore(var1, var2, ...)' pragmas within routine and ignore those vars
+        for pragma in pragmas:
+            if is_loki_pragma(pragma, starts_with='k-caching'):
+                if pragma_ignore := get_pragma_parameters(pragma, starts_with='k-caching').get('ignore', None):
+                    ignore += as_tuple(v.strip() for v in pragma_ignore.split(','))
+        ignore_names = set(var.lower() for var in ignore)
+        return ignore_names
+
+    def identify_multilevel_arrays(self, local_arrays):
+        """
+        Identify local arrays/temporaries that have an access in the vertical dimension
+        that is different to '<vertical.index>', e.g., '<vertical.index> +/- 1'
+        """
+        multilevel_local_arrays = []
+        for arr in local_arrays:
+            for dim in arr.dimensions:
+                if self.vertical.index in FindVariables().visit(dim):
+                    # dim is not equal to vertical.index e.g., vertical.index +/- 1
+                    if dim != self.vertical.index:
+                        multilevel_local_arrays.append(arr)
+        return multilevel_local_arrays
+
+    def correct_init_of_multilevel_arrays(self, routine, multilevel_local_arrays):
+        """
+        Possibly handle initialization of those multilevel local arrays (given as a
+        collection of lower-case names) via splitting relevant loops, i.e., creating
+        a separate init loop that keeps only the assignments touching those arrays.
+
+        .. note::
+            This relies on pragmas being inserted in the input source code!
+        """
+        # loop_map: original '-init' loop -> (comment, init loop, remainder of the original loop)
+        # find/identify loops with pragma 'loop-fusion group(<group-name>-init)'
+        with pragmas_attached(routine, ir.Loop):
+            loop_map = {}
+            for loop in FindNodes(ir.Loop).visit(routine.body):
+                if is_loki_pragma(loop.pragma, starts_with='loop-fusion'):
+                    parameters = get_pragma_parameters(loop.pragma, starts_with='loop-fusion')
+                    group = parameters.get('group', 'default')
+                    if group.endswith('-init'):
+                        nodes_to_be_moved = ()
+                        nodes = FindNodes(ir.Assignment).visit(loop.body)
+                        node_map = {}
+                        node_map_init = {}
+                        # find nodes that have multilevel arrays
+                        for node in nodes:
+                            node_vars = FindVariables().visit(node)
+                            if any(node_var.name.lower() in multilevel_local_arrays for node_var in node_vars):
+                                nodes_to_be_moved += (node,)
+                                node_map[node] = None
+                            else:
+                                node_map_init[node] = None
+                        # split the loop/create a new node to move those nodes with
+                        # multilevel arrays to the new node
+                        if nodes_to_be_moved:
+                            pragmas = loop.pragma
+                            new_pragmas = [pragma.clone(content=pragma.content.replace('-init', '')) if '-init'
+                                    in pragma.content else pragma for pragma in pragmas]
+                            # init part (excluded from fusion/demotion checks via group 'ignore')
+                            transf_init = Transformer(node_map_init).visit(loop.clone(
+                                    pragma=as_tuple(ir.Pragma(keyword='loki',
+                                        content='fused-loop group(ignore)'))))
+                            # rest of the original node/loop
+                            transf_orig = Transformer(node_map).visit(loop.clone(pragma=as_tuple(new_pragmas)))
+                            loop_map[loop] = (ir.Comment('! Loki generated loop for init ...'),
+                                    transf_init, transf_orig)
+            if loop_map:
+                routine.body = Transformer(loop_map).visit(routine.body)
diff --git a/loki/transformations/tests/test_transform_loop.py b/loki/transformations/tests/test_transform_loop.py
index a2209f4c3..b7e04c607 100644
--- a/loki/transformations/tests/test_transform_loop.py
+++ b/loki/transformations/tests/test_transform_loop.py
@@ -13,10 +13,10 @@
 from loki import Subroutine
 from loki.build import jit_compile, clean_test
 from loki.expression import symbols as sym
-from loki.frontend import available_frontends, OMNI
+from loki.frontend import available_frontends
 from loki.ir import (
     is_loki_pragma, pragmas_attached, FindNodes, Loop, Conditional,
-    Assignment
+    Assignment, FindVariables
 )
 
 from loki.transformations.transform_loop import (
@@ -215,6 +215,59 @@ def test_transform_loop_interchange_project(tmp_path, frontend):
     clean_test(interchanged_filepath)
 
 
+@pytest.mark.parametrize('frontend', available_frontends())
+@pytest.mark.parametrize('insert_loc', (False, True))
+def test_transform_loop_fuse_ordering(frontend, insert_loc):
+    """
+    Check where the fused loop ends up with and without the ``insert-loc`` pragma option.
+    """
+    fcode = f"""
+subroutine transform_loop_fuse_ordering(a, b, c, n, m)
+  integer, intent(out) :: a(m, n), b(m, n), c(m)
+  integer, intent(in) :: n, m
+  integer :: i
+
+  !$loki loop-fusion group(1)
+  !$loki loop-interchange
+  do j=1,m
+    do i=1,n
+      a(j, i) = i + j
+    enddo
+  end do
+
+  !$loki loop-fusion group(1)
+  do i=1,n
+    do j=1,m
+      a(j, i) = i + j
+    enddo
+  end do
+  
+  do j=1,m
+    c(j) = j
+  enddo
+
+  !$loki loop-fusion group(1) {'insert-loc' if insert_loc else ''}
+  do i=1,n-1
+    do j=1,m
+      b(j, i) = n-i+1 + j
+    enddo
+  end do
+end subroutine transform_loop_fuse_ordering
+"""
+    routine = Subroutine.from_source(fcode, frontend=frontend)
+    assert len(FindNodes(Loop).visit(routine.body)) == 7
+    loop_interchange(routine)
+    loop_fusion(routine)
+    loops = FindNodes(Loop).visit(routine.body)
+    assert len(loops) == 5
+    loop_0_vars = [var.name.lower() for var in FindVariables().visit(loops[0].body)]
+    if insert_loc:
+        assert loops[0].variable.name.lower() == 'j'
+        assert 'c' in loop_0_vars
+    else:
+        assert loops[0].variable.name.lower() == 'i'
+        assert 'c' not in loop_0_vars
+
 @pytest.mark.parametrize('frontend', available_frontends())
 def test_transform_loop_fuse_matching(tmp_path, frontend):
     """
@@ -1631,7 +1684,7 @@ def test_transform_loop_unroll(tmp_path, frontend):
     # Test the reference solution
     s = np.zeros(1)
     function(s=s)
-    assert s == sum([x + 1 for x in range(1, 11)])
+    assert s == sum(x + 1 for x in range(1, 11))
 
     # Apply transformation
     assert len(FindNodes(Loop).visit(routine.body)) == 1
@@ -1644,7 +1697,7 @@ def test_transform_loop_unroll(tmp_path, frontend):
     # Test transformation
     s = np.zeros(1)
     unrolled_function(s=s)
-    assert s == sum([x + 1 for x in range(1, 11)])
+    assert s == sum(x + 1 for x in range(1, 11))
 
     clean_test(filepath)
     clean_test(unrolled_filepath)
@@ -1673,7 +1726,7 @@ def test_transform_loop_unroll_step(tmp_path, frontend):
     # Test the reference solution
     s = np.zeros(1)
     function(s=s)
-    assert s == sum([x + 1 for x in range(1, 11, 2)])
+    assert s == sum(x + 1 for x in range(1, 11, 2))
 
     # Apply transformation
     assert len(FindNodes(Loop).visit(routine.body)) == 1
@@ -1686,7 +1739,7 @@ def test_transform_loop_unroll_step(tmp_path, frontend):
     # Test transformation
     s = np.zeros(1)
     unrolled_function(s=s)
-    assert s == sum([x + 1 for x in range(1, 11, 2)])
+    assert s == sum(x + 1 for x in range(1, 11, 2))
 
     clean_test(filepath)
     clean_test(unrolled_filepath)
@@ -1717,7 +1770,7 @@ def test_transform_loop_unroll_non_literal_range(tmp_path, frontend):
     # Test the reference solution
     s = np.zeros(1)
     function(s=s)
-    assert s == sum([x + 1 for x in range(1, 11)])
+    assert s == sum(x + 1 for x in range(1, 11))
 
     # Apply transformation
     assert len(FindNodes(Loop).visit(routine.body)) == 1
@@ -1730,7 +1783,7 @@ def test_transform_loop_unroll_non_literal_range(tmp_path, frontend):
     # Test transformation
     s = np.zeros(1)
     unrolled_function(s=s)
-    assert s == sum([x + 1 for x in range(1, 11)])
+    assert s == sum(x + 1 for x in range(1, 11))
 
     clean_test(filepath)
     clean_test(unrolled_filepath)
@@ -1762,7 +1815,7 @@ def test_transform_loop_unroll_nested(tmp_path, frontend):
     # Test the reference solution
     s = np.zeros(1)
     function(s=s)
-    assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))])
+    assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6)))
 
     # Apply transformation
     assert len(FindNodes(Loop).visit(routine.body)) == 2
@@ -1775,7 +1828,7 @@ def test_transform_loop_unroll_nested(tmp_path, frontend):
     # Test transformation
     s = np.zeros(1)
     unrolled_function(s=s)
-    assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))])
+    assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6)))
 
     clean_test(filepath)
     clean_test(unrolled_filepath)
@@ -1807,7 +1860,7 @@ def test_transform_loop_unroll_nested_restricted_depth(tmp_path, frontend):
     # Test the reference solution
     s = np.zeros(1)
     function(s=s)
-    assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))])
+    assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6)))
 
     # Apply transformation
     assert len(FindNodes(Loop).visit(routine.body)) == 2
@@ -1820,7 +1873,7 @@ def test_transform_loop_unroll_nested_restricted_depth(tmp_path, frontend):
     # Test transformation
     s = np.zeros(1)
     unrolled_function(s=s)
-    assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))])
+    assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6)))
 
     clean_test(filepath)
     clean_test(unrolled_filepath)
@@ -1854,7 +1907,7 @@ def test_transform_loop_unroll_nested_restricted_depth_unrollable(tmp_path, fron
     # Test the reference solution
     s = np.zeros(1)
     function(s=s)
-    assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))])
+    assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6)))
 
     # Apply transformation
     assert len(FindNodes(Loop).visit(routine.body)) == 2
@@ -1867,7 +1920,7 @@ def test_transform_loop_unroll_nested_restricted_depth_unrollable(tmp_path, fron
     # Test transformation
     s = np.zeros(1)
     unrolled_function(s=s)
-    assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))])
+    assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6)))
 
     clean_test(filepath)
     clean_test(unrolled_filepath)
@@ -1915,7 +1968,7 @@ def test_transform_loop_unroll_nested_counters(tmp_path, frontend):
     # Test transformation
     s = np.zeros(1)
     unrolled_function(s=s)
-    assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 11)) if b <= a])
+    assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 11)) if b <= a)
 
     clean_test(filepath)
     clean_test(unrolled_filepath)
@@ -1953,7 +2006,7 @@ def test_transform_loop_unroll_nested_neighbours(tmp_path, frontend):
     # Test the reference solution
     s = np.zeros(1)
     function(s=s)
-    assert s == 2 * sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))])
+    assert s == 2 * sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6)))
     # Apply transformation
     assert len(FindNodes(Loop).visit(routine.body)) == 3
     loop_unroll(routine)
@@ -1965,7 +2018,7 @@ def test_transform_loop_unroll_nested_neighbours(tmp_path, frontend):
     # Test transformation
     s = np.zeros(1)
     unrolled_function(s=s)
-    assert s == 2 * sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))])
+    assert s == 2 * sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6)))
 
     clean_test(filepath)
     clean_test(unrolled_filepath)
diff --git a/loki/transformations/transform_loop.py b/loki/transformations/transform_loop.py
index a6f788522..29299b4eb 100644
--- a/loki/transformations/transform_loop.py
+++ b/loki/transformations/transform_loop.py
@@ -294,140 +294,147 @@ def loop_fusion(routine):
                 group = parameters.get('group', 'default')
                 fusion_groups[group] += [(loop, parameters)]
 
-    if not fusion_groups:
-        return
-
-    # Merge loops in each group and put them in the position of the group's first loop
-    for group, loop_parameter_lists in fusion_groups.items():
-        loop_list, parameters = zip(*loop_parameter_lists)
-
-        # First, determine the collapse depth and extract user-annotated loop ranges from pragmas
-        collapse = [param.get('collapse', None) for param in parameters]
-        if collapse != [collapse[0]] * len(collapse):
-            raise RuntimeError(f'Conflicting collapse values in group "{group}"')
-        collapse = int(collapse[0]) if collapse[0] is not None else 1
-
-        pragma_ranges = [pragma_ranges_to_loop_ranges(param, routine) for param in parameters]
-
-        # If we have a pragma somewhere with an explicit loop range, we use that for the fused loop
-        range_set = {r for r in pragma_ranges if r is not None}
-        if len(range_set) not in (0, 1):
-            raise RuntimeError(f'Pragma-specified loop ranges in group "{group}" do not match')
-
-        fusion_ranges = None
-        if range_set:
-            fusion_ranges = range_set.pop()
-
-        # Next, extract loop ranges for all loops in group and convert to iteration space
-        # polyhedrons for easier alignment
-        loop_variables, loop_ranges, loop_bodies = \
-                zip(*[get_loop_components(get_nested_loops(loop, collapse)) for loop in loop_list])
-        iteration_spaces = [Polyhedron.from_loop_ranges(variables, ranges)
-                            for variables, ranges in zip(loop_variables, loop_ranges)]
-
-        # Find the fused iteration space (if not given by a pragma)
-        if fusion_ranges is None:
-            fusion_ranges = []
-            for level in range(collapse):
-                lower_bounds, upper_bounds = [], []
-                ignored_variables = list(range(level+1, collapse))
-
-                for p in iteration_spaces:
-                    for bound in p.lower_bounds(level, ignored_variables):
-                        # Decide if we learn something new from this bound, which could be because:
-                        # (1) we don't have any bounds, yet
-                        # (2) bound is smaller than existing lower bounds (i.e. diff < 0)
-                        # (3) bound is not constant and none of the existing bounds are lower (i.e. diff >= 0)
-                        diff = [simplify(bound - b) for b in lower_bounds]
-                        is_any_negative = any(is_constant(d) and symbolic_op(d, op.lt, 0) for d in diff)
-                        is_any_not_negative = any(is_constant(d) and symbolic_op(d, op.ge, 0) for d in diff)
-                        is_new_bound = (not lower_bounds or is_any_negative or
-                                        (not is_constant(bound) and not is_any_not_negative))
-                        if is_new_bound:
-                            # Remove any lower bounds made redundant by bound:
-                            lower_bounds = [b for b, d in zip(lower_bounds, diff)
-                                            if not (is_constant(d) and symbolic_op(d, op.lt, 0))]
-                            lower_bounds += [bound]
-
-                    for bound in p.upper_bounds(level, ignored_variables):
-                        # Decide if we learn something new from this bound, which could be because:
-                        # (1) we don't have any bounds, yet
-                        # (2) bound is larger than existing upper bounds (i.e. diff > 0)
-                        # (3) bound is not constant and none of the existing bounds are larger (i.e. diff <= 0)
-                        diff = [simplify(bound - b) for b in upper_bounds]
-                        is_any_positive = any(is_constant(d) and symbolic_op(d, op.gt, 0) for d in diff)
-                        is_any_not_positive = any(is_constant(d) and symbolic_op(d, op.le, 0) for d in diff)
-                        is_new_bound = (not upper_bounds or is_any_positive or
-                                        (not is_constant(bound) and not is_any_not_positive))
-                        if is_new_bound:
-                            # Remove any lower bounds made redundant by bound:
-                            upper_bounds = [b for b, d in zip(upper_bounds, diff)
-                                            if not (is_constant(d) and symbolic_op(d, op.gt, 0))]
-                            upper_bounds += [bound]
-
-                if len(lower_bounds) == 1:
-                    lower_bounds = lower_bounds[0]
-                else:
-                    fct_symbol = sym.ProcedureSymbol('min', scope=routine)
-                    lower_bounds = sym.InlineCall(fct_symbol, parameters=as_tuple(lower_bounds))
-
-                if len(upper_bounds) == 1:
-                    upper_bounds = upper_bounds[0]
-                else:
-                    fct_symbol = sym.ProcedureSymbol('max', scope=routine)
-                    upper_bounds = sym.InlineCall(fct_symbol, parameters=as_tuple(upper_bounds))
-
-                fusion_ranges += [sym.LoopRange((lower_bounds, upper_bounds))]
-
-        # Align loop ranges and collect bodies
-        fusion_bodies = []
-        fusion_variables = loop_variables[0]
-        for idx, (variables, ranges, bodies, p) in enumerate(
-                zip(loop_variables, loop_ranges, loop_bodies, iteration_spaces)):
-            # TODO: This throws away anything that is not in the inner-most loop body.
-            body = flatten([Comment(f'! Loki loop-fusion - body {idx} begin'),
-                            bodies[-1],
-                            Comment(f'! Loki loop-fusion - body {idx} end')])
-
-            # Replace loop variables if necessary
-            var_map = {}
-            for loop_variable, fusion_variable in zip(variables, fusion_variables):
-                if loop_variable != fusion_variable:
-                    var_map.update({var: fusion_variable for var in FindVariables().visit(body)
-                                    if var.name.lower() == loop_variable.name})
-            if var_map:
-                body = SubstituteExpressions(var_map).visit(body)
-
-            # Wrap in conditional if loop bounds are different
-            conditions = []
-            for loop_range, fusion_range, variable in zip(ranges, fusion_ranges, fusion_variables):
-                if symbolic_op(loop_range.start, op.ne, fusion_range.start):
-                    conditions += [sym.Comparison(variable, '>=', loop_range.start)]
-                if symbolic_op(loop_range.stop, op.ne, fusion_range.stop):
-                    conditions += [sym.Comparison(variable, '<=', loop_range.stop)]
-            if conditions:
-                if len(conditions) == 1:
-                    condition = conditions[0]
-                else:
-                    condition = sym.LogicalAnd(as_tuple(conditions))
-                body = Conditional(condition=condition, body=as_tuple(body), else_body=())
+        if not fusion_groups:
+            return
+
+        # Merge loops in each group and put them in the position of the group's first loop
+        #  UNLESS 'insert-loc' location is specified for at least one of the group's fusion
+        #  pragmas; in this case the position is the first occurrence of 'insert-loc' for each group
+        for group, loop_parameter_lists in fusion_groups.items():
+            loop_list, parameters = zip(*loop_parameter_lists)
+
+            # First, determine the collapse depth and extract user-annotated loop ranges from pragmas
+            collapse = [param.get('collapse', None) for param in parameters]
+            insert_locs = [param.get('insert-loc', False) for param in parameters]
+            if collapse != [collapse[0]] * len(collapse):
+                raise RuntimeError(f'Conflicting collapse values in group "{group}"')
+            collapse = int(collapse[0]) if collapse[0] is not None else 1
+
+            pragma_ranges = [pragma_ranges_to_loop_ranges(param, routine) for param in parameters]
+
+            # If we have a pragma somewhere with an explicit loop range, we use that for the fused loop
+            range_set = {r for r in pragma_ranges if r is not None}
+            if len(range_set) not in (0, 1):
+                raise RuntimeError(f'Pragma-specified loop ranges in group "{group}" do not match')
+
+            fusion_ranges = None
+            if range_set:
+                fusion_ranges = range_set.pop()
+
+            # Next, extract loop ranges for all loops in group and convert to iteration space
+            # polyhedrons for easier alignment
+            loop_variables, loop_ranges, loop_bodies = \
+                    zip(*[get_loop_components(get_nested_loops(loop, collapse)) for loop in loop_list])
+            iteration_spaces = [Polyhedron.from_loop_ranges(variables, ranges)
+                                for variables, ranges in zip(loop_variables, loop_ranges)]
+
+            # Find the fused iteration space (if not given by a pragma)
+            if fusion_ranges is None:
+                fusion_ranges = []
+                for level in range(collapse):
+                    lower_bounds, upper_bounds = [], []
+                    ignored_variables = list(range(level+1, collapse))
+
+                    for p in iteration_spaces:
+                        for bound in p.lower_bounds(level, ignored_variables):
+                            # Decide if we learn something new from this bound, which could be because:
+                            # (1) we don't have any bounds, yet
+                            # (2) bound is smaller than existing lower bounds (i.e. diff < 0)
+                            # (3) bound is not constant and none of the existing bounds are lower (i.e. diff >= 0)
+                            diff = [simplify(bound - b) for b in lower_bounds]
+                            is_any_negative = any(is_constant(d) and symbolic_op(d, op.lt, 0) for d in diff)
+                            is_any_not_negative = any(is_constant(d) and symbolic_op(d, op.ge, 0) for d in diff)
+                            is_new_bound = (not lower_bounds or is_any_negative or
+                                            (not is_constant(bound) and not is_any_not_negative))
+                            if is_new_bound:
+                                # Remove any lower bounds made redundant by bound:
+                                lower_bounds = [b for b, d in zip(lower_bounds, diff)
+                                                if not (is_constant(d) and symbolic_op(d, op.lt, 0))]
+                                lower_bounds += [bound]
+
+                        for bound in p.upper_bounds(level, ignored_variables):
+                            # Decide if we learn something new from this bound, which could be because:
+                            # (1) we don't have any bounds, yet
+                            # (2) bound is larger than existing upper bounds (i.e. diff > 0)
+                            # (3) bound is not constant and none of the existing bounds are larger (i.e. diff <= 0)
+                            diff = [simplify(bound - b) for b in upper_bounds]
+                            is_any_positive = any(is_constant(d) and symbolic_op(d, op.gt, 0) for d in diff)
+                            is_any_not_positive = any(is_constant(d) and symbolic_op(d, op.le, 0) for d in diff)
+                            is_new_bound = (not upper_bounds or is_any_positive or
+                                            (not is_constant(bound) and not is_any_not_positive))
+                            if is_new_bound:
+                                # Remove any upper bounds made redundant by bound:
+                                upper_bounds = [b for b, d in zip(upper_bounds, diff)
+                                                if not (is_constant(d) and symbolic_op(d, op.gt, 0))]
+                                upper_bounds += [bound]
 
-            fusion_bodies += [body]
-
-        # Create the nested fused loop and replace original loops
-        fusion_loop = flatten(fusion_bodies)
-        for fusion_variable, fusion_range in zip(reversed(fusion_variables), reversed(fusion_ranges)):
-            fusion_loop = Loop(variable=fusion_variable, body=as_tuple(fusion_loop), bounds=fusion_range)
+                    if len(lower_bounds) == 1:
+                        lower_bounds = lower_bounds[0]
+                    else:
+                        # TODO: could/should be ProcedureSymbol, however refer to issue: #390
+                        fct_symbol = sym.DeferredTypeSymbol(name='min', scope=routine)
+                        lower_bounds = sym.InlineCall(fct_symbol, parameters=as_tuple(lower_bounds))
 
-        comment = Comment(f'! Loki loop-fusion group({group})')
-        loop_map[loop_list[0]] = (comment, fusion_loop)
-        comment = Comment(f'! Loki loop-fusion group({group}) - loop hoisted')
-        loop_map.update({loop: comment for loop in loop_list[1:]})
+                    if len(upper_bounds) == 1:
+                        upper_bounds = upper_bounds[0]
+                    else:
+                        # TODO: could/should be ProcedureSymbol, however refer to issue: #390
+                        fct_symbol = sym.DeferredTypeSymbol(name='max', scope=routine)
+                        upper_bounds = sym.InlineCall(fct_symbol, parameters=as_tuple(upper_bounds))
 
-    # Apply transformation
-    routine.body = Transformer(loop_map).visit(routine.body)
-    info('%s: fused %d loops in %d groups.', routine.name,
-         sum(len(loop_list) for loop_list in fusion_groups.values()), len(fusion_groups))
+                    fusion_ranges += [sym.LoopRange((lower_bounds, upper_bounds))]
+
+            # Align loop ranges and collect bodies
+            fusion_bodies = []
+            fusion_variables = loop_variables[0]
+            for idx, (variables, ranges, bodies, p) in enumerate(
+                    zip(loop_variables, loop_ranges, loop_bodies, iteration_spaces)):
+                # TODO: This throws away anything that is not in the inner-most loop body.
+                body = flatten([Comment(f'! Loki loop-fusion - body {idx} begin'),
+                                bodies[-1],
+                                Comment(f'! Loki loop-fusion - body {idx} end')])
+
+                # Replace loop variables if necessary
+                var_map = {}
+                for loop_variable, fusion_variable in zip(variables, fusion_variables):
+                    if loop_variable != fusion_variable:
+                        var_map.update({var: fusion_variable for var in FindVariables().visit(body)
+                                        if var.name.lower() == loop_variable.name})
+                if var_map:
+                    body = SubstituteExpressions(var_map).visit(body)
+
+                # Wrap in conditional if loop bounds are different
+                conditions = []
+                for loop_range, fusion_range, variable in zip(ranges, fusion_ranges, fusion_variables):
+                    if symbolic_op(loop_range.start, op.ne, fusion_range.start):
+                        conditions += [sym.Comparison(variable, '>=', loop_range.start)]
+                    if symbolic_op(loop_range.stop, op.ne, fusion_range.stop):
+                        conditions += [sym.Comparison(variable, '<=', loop_range.stop)]
+                if conditions:
+                    if len(conditions) == 1:
+                        condition = conditions[0]
+                    else:
+                        condition = sym.LogicalAnd(as_tuple(conditions))
+                    body = Conditional(condition=condition, body=as_tuple(body), else_body=())
+
+                fusion_bodies += [body]
+
+            # Create the nested fused loop and replace original loops
+            fusion_loop = flatten(fusion_bodies)
+            for fusion_variable, fusion_range in zip(reversed(fusion_variables), reversed(fusion_ranges)):
+                fusion_loop = Loop(variable=fusion_variable, body=as_tuple(fusion_loop), bounds=fusion_range)
+
+            comment = Comment(f'! Loki loop-fusion group({group})')
+            insert_loc = insert_locs.index(None) if None in insert_locs else 0
+            loop_map[loop_list[insert_loc]] = (comment, Pragma(keyword='loki',
+                content=f'fused-loop group({group})'), fusion_loop)
+            comment = Comment(f'! Loki loop-fusion group({group}) - loop hoisted')
+            loop_map.update({loop: comment for i_loop, loop in enumerate(loop_list) if i_loop != insert_loc})
+
+        # Apply transformation
+        routine.body = Transformer(loop_map).visit(routine.body)
+        info('%s: fused %d loops in %d groups.', routine.name,
+             sum(len(loop_list) for loop_list in fusion_groups.values()), len(fusion_groups))
 
 
 class FissionTransformer(NestedMaskedTransformer):
diff --git a/scripts/loki_transform.py b/scripts/loki_transform.py
index f4d3bdccb..e01c68d1e 100644
--- a/scripts/loki_transform.py
+++ b/scripts/loki_transform.py
@@ -300,7 +300,7 @@ def convert(
         pipeline = scheduler.config.transformations.get('scc', None)
         if not pipeline:
             pipeline = SCCVectorPipeline(
-                horizontal=horizontal,
+                horizontal=horizontal, vertical=vertical,
                 block_dim=block_dim, directive=directive,
                 trim_vector_sections=trim_vector_sections
             )
@@ -310,7 +310,7 @@ def convert(
         pipeline = scheduler.config.transformations.get('scc-hoist', None)
         if not pipeline:
             pipeline = SCCHoistPipeline(
-                horizontal=horizontal,
+                horizontal=horizontal, vertical=vertical,
                 block_dim=block_dim, directive=directive,
                 dim_vars=(vertical.size,) if vertical else None,
                 trim_vector_sections=trim_vector_sections
@@ -321,7 +321,7 @@ def convert(
         pipeline = scheduler.config.transformations.get('scc-stack', None)
         if not pipeline:
             pipeline = SCCStackPipeline(
-                horizontal=horizontal,
+                horizontal=horizontal, vertical=vertical,
                 block_dim=block_dim, directive=directive,
                 check_bounds=False,
                 trim_vector_sections=trim_vector_sections