diff --git a/loki/batch/tests/test_scheduler.py b/loki/batch/tests/test_scheduler.py index bd411ca95..b3d42948b 100644 --- a/loki/batch/tests/test_scheduler.py +++ b/loki/batch/tests/test_scheduler.py @@ -2887,21 +2887,22 @@ def test_pipeline_config_compose(config): assert isinstance(pipeline, Pipeline) # Check that the pipeline is correctly composed - assert len(pipeline.transformations) == 7 + assert len(pipeline.transformations) == 8 assert type(pipeline.transformations[0]).__name__ == 'RemoveCodeTransformation' - assert type(pipeline.transformations[1]).__name__ == 'SCCBaseTransformation' - assert type(pipeline.transformations[2]).__name__ == 'SCCDevectorTransformation' - assert type(pipeline.transformations[3]).__name__ == 'SCCDemoteTransformation' - assert type(pipeline.transformations[4]).__name__ == 'SCCRevectorTransformation' - assert type(pipeline.transformations[5]).__name__ == 'SCCAnnotateTransformation' - assert type(pipeline.transformations[6]).__name__ == 'ModuleWrapTransformation' + assert type(pipeline.transformations[1]).__name__ == 'SCCFuseVerticalLoops' + assert type(pipeline.transformations[2]).__name__ == 'SCCBaseTransformation' + assert type(pipeline.transformations[3]).__name__ == 'SCCDevectorTransformation' + assert type(pipeline.transformations[4]).__name__ == 'SCCDemoteTransformation' + assert type(pipeline.transformations[5]).__name__ == 'SCCRevectorTransformation' + assert type(pipeline.transformations[6]).__name__ == 'SCCAnnotateTransformation' + assert type(pipeline.transformations[7]).__name__ == 'ModuleWrapTransformation' # Check for some specified and default constructor flags assert pipeline.transformations[0].call_names == ('dr_hook',) assert pipeline.transformations[0].remove_imports is False - assert isinstance(pipeline.transformations[1].horizontal, Dimension) - assert pipeline.transformations[1].horizontal.size == 'KLON' - assert pipeline.transformations[1].horizontal.index == 'JL' - assert 
pipeline.transformations[1].directive == 'openacc' - assert pipeline.transformations[2].trim_vector_sections is True - assert pipeline.transformations[6].replace_ignore_items is True + assert isinstance(pipeline.transformations[2].horizontal, Dimension) + assert pipeline.transformations[2].horizontal.size == 'KLON' + assert pipeline.transformations[2].horizontal.index == 'JL' + assert pipeline.transformations[2].directive == 'openacc' + assert pipeline.transformations[3].trim_vector_sections is True + assert pipeline.transformations[7].replace_ignore_items is True diff --git a/loki/transformations/single_column/__init__.py b/loki/transformations/single_column/__init__.py index d5fe934d1..2ed513c63 100644 --- a/loki/transformations/single_column/__init__.py +++ b/loki/transformations/single_column/__init__.py @@ -13,3 +13,4 @@ from loki.transformations.single_column.scc_cuf import * # noqa from loki.transformations.single_column.vector import * # noqa from loki.transformations.single_column.scc_low_level import * # noqa +from loki.transformations.single_column.vertical import * # noqa diff --git a/loki/transformations/single_column/scc.py b/loki/transformations/single_column/scc.py index 3b7f7d7dc..001ddca14 100644 --- a/loki/transformations/single_column/scc.py +++ b/loki/transformations/single_column/scc.py @@ -19,7 +19,7 @@ from loki.transformations.single_column.vector import ( SCCDevectorTransformation, SCCDemoteTransformation, SCCRevectorTransformation ) - +from loki.transformations.single_column.vertical import SCCFuseVerticalLoops __all__ = [ 'SCCVectorPipeline', 'SCCHoistPipeline', 'SCCStackPipeline', 'SCCRawStackPipeline' @@ -75,6 +75,7 @@ """ SCCVectorPipeline = partial( Pipeline, classes=( + SCCFuseVerticalLoops, SCCBaseTransformation, SCCDevectorTransformation, SCCDemoteTransformation, @@ -121,6 +122,7 @@ """ SCCHoistPipeline = partial( Pipeline, classes=( + SCCFuseVerticalLoops, SCCBaseTransformation, SCCDevectorTransformation, SCCDemoteTransformation, 
@@ -166,6 +168,7 @@ """ SCCStackPipeline = partial( Pipeline, classes=( + SCCFuseVerticalLoops, SCCBaseTransformation, SCCDevectorTransformation, SCCDemoteTransformation, diff --git a/loki/transformations/single_column/tests/test_scc_vertical.py b/loki/transformations/single_column/tests/test_scc_vertical.py new file mode 100644 index 000000000..3a476a882 --- /dev/null +++ b/loki/transformations/single_column/tests/test_scc_vertical.py @@ -0,0 +1,255 @@ +# (C) Copyright 2018- ECMWF. +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +import pytest + +from loki import Subroutine, Dimension +from loki.frontend import available_frontends +from loki.ir import FindNodes, Loop, FindVariables +from loki.transformations.single_column import SCCFuseVerticalLoops + + +@pytest.fixture(scope='module', name='horizontal') +def fixture_horizontal(): + return Dimension( + name='horizontal', size='nlon', index='jl', + bounds=('start', 'end'), aliases=('nproma',) + ) + +@pytest.fixture(scope='module', name='horizontal_bounds_aliases') +def fixture_horizontal_bounds_aliases(): + return Dimension( + name='horizontal_bounds_aliases', size='nlon', index='jl', + bounds=('start', 'end'), aliases=('nproma',), + bounds_aliases=('bnds%start', 'bnds%end') + ) + +@pytest.fixture(scope='module', name='vertical') +def fixture_vertical(): + return Dimension(name='vertical', size='nz', index='jk', aliases=('nlev',)) + +@pytest.fixture(scope='module', name='blocking') +def fixture_blocking(): + return Dimension(name='blocking', size='nb', index='b') + + +@pytest.mark.parametrize('frontend', available_frontends()) +def test_simple_scc_fuse_verticals_transformation(frontend, horizontal, vertical): + """ 
+ Test simple example of vertical loop fusion and demotion of temporaries. + """ + + fcode_kernel = """ + SUBROUTINE compute_column(start, end, nlon, nz, q, t) + INTEGER, INTENT(IN) :: start, end ! Iteration indices + INTEGER, INTENT(IN) :: nlon, nz ! Size of the horizontal and vertical + REAL, INTENT(INOUT) :: t(nlon,nz) + REAL, INTENT(INOUT) :: q(nlon,nz) + REAL :: temp_t(nlon, nz) + REAL :: temp_q(nlon, nz) + INTEGER :: jl, JK + REAL :: c + + c = 5.345 + !$loki loop-fusion group(1) + DO JK = 1, nz + DO jl = start, end + temp_t(jl, jk) = c + temp_q(jl, JK) = c + END DO + END DO + + !$loki loop-fusion group(1) + DO jk = 2, nz + DO jl = start, end + t(jl, jk) = temp_t(jl, jk) * jk + q(jl, jk) = q(jl, jk-1) + t(jl, jk) * temp_q(jl, jk) + END DO + END DO + + ! The scaling is purposefully upper-cased + DO JL = START, END + Q(JL, NZ) = Q(JL, NZ) * C + END DO + END SUBROUTINE compute_column +""" + kernel = Subroutine.from_source(fcode_kernel, frontend=frontend) + + # Ensure we have five loops in the kernel prior to transformation + kernel_loops = FindNodes(Loop).visit(kernel.body) + assert len(kernel_loops) == 5 + + # no-op as 'compute_column' is not within apply_to + SCCFuseVerticalLoops(vertical=vertical, apply_to=('another_kernel',)).apply(kernel, role='kernel') + # Ensure the no-op left all five loops in the kernel untouched + kernel_loops = FindNodes(Loop).visit(kernel.body) + assert len(kernel_loops) == 5 + + # actual loop fusion and demotion ... 
(as apply_to is not provided and therefore all routines are dispatched) + SCCFuseVerticalLoops(vertical=vertical).apply(kernel, role='kernel') + + # Ensure the two vertical loops are fused + kernel_loops = FindNodes(Loop).visit(kernel.body) + assert len(kernel_loops) == 4 + assert kernel_loops[0].variable.name.lower() == 'jk' + assert kernel_loops[-1].variable.name.lower() == 'jl' + assert len([loop for loop in kernel_loops if loop.variable.name.lower() == 'jk']) == 1 + kernel_var_map = kernel.variable_map + assert kernel_var_map['temp_t'].shape == (horizontal.size,) + assert kernel_var_map['temp_q'].shape == (horizontal.size,) + kernel_vars = [var for var in FindVariables().visit(kernel.body) if var.name.lower() in ['temp_t', 'temp_q']] + for var in kernel_vars: + assert var.shape == (horizontal.size,) + assert var.dimensions == (horizontal.index,) + + +@pytest.mark.parametrize('frontend', available_frontends()) +@pytest.mark.parametrize('ignore', (False, True)) +def test_scc_fuse_verticals_transformation(frontend, horizontal, vertical, ignore): + """ + Test somewhat more sophisticated example of vertical loop fusion + and demotion of temporaries. + """ + + fcode_kernel = f""" + SUBROUTINE compute_column(start, end, nlon, nz, q, t) + INTEGER, INTENT(IN) :: start, end ! Iteration indices + INTEGER, INTENT(IN) :: nlon, nz ! 
Size of the horizontal and vertical + REAL, INTENT(INOUT) :: t(nlon,nz) + REAL, INTENT(INOUT) :: q(nlon,nz) + REAL :: temp_t(nlon, nz) + REAL :: temp_t2(nlon, nz) + REAL :: temp_q(nlon, nz) + REAL :: temp_q2(nlon, nz) + REAL :: temp_cld(nlon, nz, 5) + INTEGER :: jl, jk, jm + REAL :: c + + {'!$loki k-caching ignore(temp_q2)' if ignore else ''} + + c = 5.345 + !$loki loop-fusion group(1-init) + DO jk = 1, nz + DO jl = start, end + temp_t(jl, jk) = c + temp_q(jl, jk) = c + temp_t2(jl, jk) = 2*c + END DO + END DO + + !$loki loop-fusion group(1) + !$loki loop-interchange + DO jm=1,5 + DO jk = 1, nz + DO jl = start, end + temp_cld(jl, jk, jm) = 3.1415 + END DO + END DO + END DO + + DO jl = start, end + q(jl, jk) = 0. + END DO + + !$loki loop-fusion group(1) insert + DO jk = 2, nz + DO jl = start, end + t(jl, jk) = temp_t(jl, jk) * temp_t2(jl, jk-1) * temp_cld(jl, jk, 1) + q(jl, jk) = q(jl, jk-1) + t(jl, jk) * temp_q(jl, jk) + END DO + END DO + + CALL nested_kernel(start, end, nlon, nz, q) + + !$loki loop-fusion group(2) + DO jk = 2, nz + DO jl = start, end + temp_q2(jl, jk) = 3.1415 + END DO + END DO + + !$loki loop-fusion group(2) + DO jk = 2, nz + DO jl = start, end + t(jl, jk) = t(jl, jk) + 3.1415 + q(jl, jk) = q(jl, jk-1) + t(jl, jk) * temp_q(jl, jk) + temp_q2(jl, jk) + END DO + END DO + + ! 
The scaling is purposefully upper-cased + DO JL = START, END + Q(JL, NZ) = Q(JL, NZ) * C + END DO + END SUBROUTINE compute_column +""" + + + kernel = Subroutine.from_source(fcode_kernel, frontend=frontend) + + # Ensure we have 13 loops in the kernel prior to transformation + kernel_loops = FindNodes(Loop).visit(kernel.body) + assert len(kernel_loops) == 13 + SCCFuseVerticalLoops(vertical=vertical).apply(kernel, role='kernel') + + # Ensure the two vertical loops are fused + kernel_loops = FindNodes(Loop).visit(kernel.body) + assert len(kernel_loops) == 12 + vertical_loops = [loop for loop in kernel_loops if loop.variable.name.lower() == vertical.index] + assert len(vertical_loops) == 3 + + shape1D = (horizontal.size,) + shape2D = (horizontal.size, vertical.size) + dimension1D = (horizontal.index,) + dimension2D = (horizontal.index,vertical.index) + dimension2DI1 = (horizontal.index, f'{vertical.index}-1') + + vertical_loop_0_vars = FindVariables().visit(vertical_loops[0].body) + vertical_loop_0_var_names = [var.name.lower() for var in vertical_loop_0_vars] + vertical_loop_0_var_dict = dict(zip(vertical_loop_0_var_names, vertical_loop_0_vars)) + assert 'temp_t2' in vertical_loop_0_var_names + assert 'temp_t' not in vertical_loop_0_var_names + assert 'temp_q' not in vertical_loop_0_var_names + assert 'temp_q2' not in vertical_loop_0_var_names + assert 'temp_cld' not in vertical_loop_0_var_names + assert vertical_loop_0_var_dict['temp_t2'].shape == shape2D + assert vertical_loop_0_var_dict['temp_t2'].dimensions == dimension2D + + vertical_loop_1_vars = FindVariables().visit(vertical_loops[1].body) + vertical_loop_1_var_names = [var.name.lower() for var in vertical_loop_1_vars] + vertical_loop_1_var_dict = dict(zip(vertical_loop_1_var_names, vertical_loop_1_vars)) + assert 'temp_t2' in vertical_loop_1_var_names + assert 'temp_t' in vertical_loop_1_var_names + assert 'temp_q' in vertical_loop_1_var_names + assert 'temp_q2' not in vertical_loop_1_var_names + assert 
'temp_cld' in vertical_loop_1_var_names + assert vertical_loop_1_var_dict['temp_t2'].shape == shape2D + assert vertical_loop_1_var_dict['temp_t2'].dimensions == dimension2DI1 + assert vertical_loop_1_var_dict['temp_t'].shape == shape1D + assert vertical_loop_1_var_dict['temp_t'].dimensions == dimension1D + assert vertical_loop_1_var_dict['temp_q'].shape == shape2D + assert vertical_loop_1_var_dict['temp_q'].dimensions == dimension2D + assert vertical_loop_1_var_dict['temp_cld'].shape == shape1D + (5,) + assert vertical_loop_1_var_dict['temp_cld'].dimensions in (dimension1D + (1,), dimension1D + ('jm',)) + + vertical_loop_2_vars = FindVariables().visit(vertical_loops[2].body) + vertical_loop_2_var_names = [var.name.lower() for var in vertical_loop_2_vars] + vertical_loop_2_var_dict = dict(zip(vertical_loop_2_var_names, vertical_loop_2_vars)) + assert 'temp_t2' not in vertical_loop_2_var_names + assert 'temp_t' not in vertical_loop_2_var_names + assert 'temp_q' in vertical_loop_2_var_names + assert 'temp_q2' in vertical_loop_2_var_names + assert 'temp_cld' not in vertical_loop_2_var_names + assert vertical_loop_2_var_dict['temp_q'].shape == shape2D + assert vertical_loop_2_var_dict['temp_q'].dimensions == dimension2D + assert vertical_loop_2_var_dict['temp_q2'].shape == (shape2D if ignore else shape1D) + assert vertical_loop_2_var_dict['temp_q2'].dimensions == (dimension2D if ignore else dimension1D) + + kernel_var_map = kernel.variable_map + assert kernel_var_map['temp_t'].shape == shape1D + assert kernel_var_map['temp_t2'].shape == shape2D + assert kernel_var_map['temp_q'].shape == shape2D + assert kernel_var_map['temp_q2'].shape == (shape2D if ignore else shape1D) diff --git a/loki/transformations/single_column/vertical.py b/loki/transformations/single_column/vertical.py new file mode 100644 index 000000000..d19f11818 --- /dev/null +++ b/loki/transformations/single_column/vertical.py @@ -0,0 +1,224 @@ +# (C) Copyright 2018- ECMWF. 
+# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +from loki.batch import Transformation +from loki.expression import ( + symbols as sym +) +from loki.ir import ( + nodes as ir, FindNodes, Transformer, + is_loki_pragma, pragmas_attached, + get_pragma_parameters, FindVariables +) +from loki.tools import as_tuple, CaseInsensitiveDict +from loki.transformations.transform_loop import loop_fusion, loop_interchange +from loki.transformations.array_indexing import demote_variables +from loki.transformations.utilities import get_local_arrays +from loki.logging import info + +__all__ = ['SCCFuseVerticalLoops'] + +class SCCFuseVerticalLoops(Transformation): + """ + A transformation to fuse vertical loops and demote temporaries in the vertical + dimension if possible. + + .. note:: + This transformation currently relies on pragmas being inserted in the input + source files. Relevant pragmas are `!$loki loop-interchange` to expose the + vertical loops (in case vertical loops are nested) and `!$loki loop-fusion` + possibly grouped via `group(<name>)`. Further, if there are loops + that initialize multilevel arrays (`jk +/- 1`) it is possible to mark those + loops as `!$loki loop-fusion group(<name>-init)`. This allows splitting + the relevant loop and moving the initialization of those arrays to the top of + the group. + + Parameters + ---------- + vertical : :any:`Dimension` + :any:`Dimension` object describing the variable conventions used in code + to define the vertical data dimension and iteration space. 
+ apply_to : list of str, optional + list of routines to apply this transformation to, if not provided or None + apply to all routines (default: None) + """ + + def __init__(self, vertical=None, apply_to=None): + self.vertical = vertical + self.apply_to = tuple(name.lower() for name in as_tuple(apply_to or ())) # a lone str is one name; match case-insensitively + + def transform_subroutine(self, routine, **kwargs): + """ + Fuse vertical loops and demote temporaries in the vertical dimension + if possible. + + Parameters + ---------- + routine : :any:`Subroutine` + The subroutine in which the vertical loops should be fused and + temporaries be demoted. + """ + if self.vertical is None: + info('[SCCFuseVerticalLoops] is not applied as the vertical dimension is not defined!') + return + role = kwargs['role'] + if role == 'kernel': + if self.apply_to and routine.name.lower() not in self.apply_to: + return + self.process_kernel(routine) + + def process_kernel(self, routine): + """ + Current logic (simplified): + + 1. loop interchange to expose vertical loops + 2. fuse vertical loops (possibly into multiple groups) + 3. find local arrays to be demoted and apply heuristics to check whether this is safe + 4. 
demote those arrays which are safe to be demoted + """ + # find local arrays with a vertical dimension + relevant_local_arrays = self.find_relevant_local_arrays(routine) + # find "multilevel" thus "jk +/- 1" arrays + multilevel_relevant_local_arrays = self.identify_multilevel_arrays(relevant_local_arrays) + # loop interchange to expose vertical loops as outermost loops + loop_interchange(routine) + # handle initialization of arrays "jk +/- 1" arrays + multilevel_relevant_local_arrays_names = set(arr.name.lower() for arr in multilevel_relevant_local_arrays) + self.correct_init_of_multilevel_arrays(routine, multilevel_relevant_local_arrays_names) + # fuse vertical loops + loop_fusion(routine) + # demote in vertical dimension if possible + relevant_local_arrays_names = set(arr.name.lower() for arr in relevant_local_arrays) + demote_candidates = relevant_local_arrays_names - multilevel_relevant_local_arrays_names + # check which variables are safe to demote in the vertical + safe_to_demote = self.check_safe_to_demote(routine, demote_candidates) + # demote locals in vertical dimension + dimensions_to_demote = self.vertical.size_expressions + (f"{self.vertical.size}+1",) + demote_variables(routine, safe_to_demote, dimensions_to_demote) + + def check_safe_to_demote(self, routine, demote_candidates): + """ + Check whether variables that are candidates to be demoted in the vertical dimension are really + safe to be demoted. + + Current heuristic: If the candidate is used in more than one vertical loop, assume it is NOT safe + to demote! 
+ """ + loop_var_map = CaseInsensitiveDict() + with pragmas_attached(routine, ir.Loop): + for loop in FindNodes(ir.Loop).visit(routine.body): + if loop.variable == self.vertical.index: + if is_loki_pragma(loop.pragma, starts_with='fused-loop'): + parameters = get_pragma_parameters(loop.pragma, starts_with='fused-loop') + group = parameters.get('group', 'default') + if group == 'ignore': + continue + for var in FindVariables().visit(loop.body): + if isinstance(var, sym.Array): + loop_var_map.setdefault(var.name, set()).add(group) + + safe_to_demote = () + for var in demote_candidates: + if var in loop_var_map and len(loop_var_map[var]) <= 1: + safe_to_demote += (var,) + + return safe_to_demote + + def find_relevant_local_arrays(self, routine): + """ + Find local arrays/temporaries that do have the vertical dimension. + """ + # local/temporary arrays + local_arrays = get_local_arrays(routine, routine.body) + # only those with the vertical size within shape + relevant_local_arrays = [arr for arr in local_arrays if self.vertical.size.lower() + in FindVariables().visit(arr.shape)] + # filter arrays to be ignored (for whatever reason) + ignore_names = self.find_local_arrays_to_be_ignored(routine) + if ignore_names: + relevant_local_arrays = [arr for arr in relevant_local_arrays if arr.name.lower() not in ignore_names] + return relevant_local_arrays + + def find_local_arrays_to_be_ignored(self, routine): + """ + Identify variables to be ignore regarding demotion for whatever reason. 
+ + Reasons are: + + * explicitly marked to be ignored via pragmas within the input source file, e.g., + 'loki k-caching ignore(var1, var2, ...)' + """ + ignore = () + pragmas = FindNodes(ir.Pragma).visit(routine.body) + # look for 'loki k-caching ignore(var1, var2, ...)' pragmas within routine and ignore those vars + for pragma in pragmas: + if is_loki_pragma(pragma, starts_with='k-caching'): + if pragma_ignore := get_pragma_parameters(pragma, starts_with='k-caching').get('ignore', None): + ignore += as_tuple(v.strip() for v in pragma_ignore.split(',')) + ignore_names = set(var.lower() for var in ignore) + return ignore_names + + def identify_multilevel_arrays(self, local_arrays): + """ + Identify local arrays/temporaries that have an access in the vertical dimension + that is different to '<vertical.index>', e.g., '<vertical.index> +/- 1' + """ + multilevel_local_arrays = [] + for arr in local_arrays: + for dim in arr.dimensions: + if self.vertical.index in FindVariables().visit(dim): + # dim is not equal to vertical.index e.g., vertical.index +/- 1 + if dim != self.vertical.index: + multilevel_local_arrays.append(arr) + return multilevel_local_arrays + + def correct_init_of_multilevel_arrays(self, routine, multilevel_local_arrays): + """ + Possibly handle initialization of those multilevel local arrays via + splitting relevant loops or rather creating a new node with the relevant + nodes moved to the newly created loop. + + .. note:: + This relies on pragmas being inserted in the input source code! 
+ """ + loop_map = {} + # find/identify loops with pragma 'loop-fusion group(-init)' + with pragmas_attached(routine, ir.Loop): + loop_map = {} + for loop in FindNodes(ir.Loop).visit(routine.body): + if is_loki_pragma(loop.pragma, starts_with='loop-fusion'): + parameters = get_pragma_parameters(loop.pragma, starts_with='loop-fusion') + group = parameters.get('group', 'default') + if group.endswith('-init'): + nodes_to_be_moved = () + nodes = FindNodes(ir.Assignment).visit(loop.body) + node_map = {} + node_map_init = {} + # find nodes that have multilevel arrays + for node in nodes: + node_vars = FindVariables().visit(node) + if any(node_var.name.lower() in multilevel_local_arrays for node_var in node_vars): + nodes_to_be_moved += (node,) + node_map[node] = None + else: + node_map_init[node] = None + # split the loop/create a new node to move those nodes with + # multilevel arrays to the new node + if nodes_to_be_moved: + pragmas = loop.pragma + new_pragmas = [pragma.clone(content=pragma.content.replace('-init', '')) if '-init' + in pragma.content else pragma for pragma in pragmas] + # init part + transf_init = Transformer(node_map_init).visit(loop.clone(\ + pragma=as_tuple(ir.Pragma(keyword='loki', + content='fused-loop group(ignore)')))) + # rest of the original node/loop + transf_orig = Transformer(node_map).visit(loop.clone(pragma=as_tuple(new_pragmas))) + loop_map[loop] = (ir.Comment('! 
Loki generated loop for init ...'), + transf_init, transf_orig) + if loop_map: + routine.body = Transformer(loop_map).visit(routine.body) diff --git a/loki/transformations/tests/test_transform_loop.py b/loki/transformations/tests/test_transform_loop.py index a2209f4c3..b7e04c607 100644 --- a/loki/transformations/tests/test_transform_loop.py +++ b/loki/transformations/tests/test_transform_loop.py @@ -13,10 +13,10 @@ from loki import Subroutine from loki.build import jit_compile, clean_test from loki.expression import symbols as sym -from loki.frontend import available_frontends, OMNI +from loki.frontend import available_frontends from loki.ir import ( is_loki_pragma, pragmas_attached, FindNodes, Loop, Conditional, - Assignment + Assignment, FindVariables ) from loki.transformations.transform_loop import ( @@ -215,6 +215,59 @@ def test_transform_loop_interchange_project(tmp_path, frontend): clean_test(interchanged_filepath) +@pytest.mark.parametrize('frontend', available_frontends()) +@pytest.mark.parametrize('insert_loc', (False, True)) +def test_transform_loop_fuse_ordering(frontend, insert_loc): + """ + Apply loop fusion for two loops with matching iteration spaces. 
+ """ + fcode = f""" +subroutine transform_loop_fuse_ordering(a, b, c, n, m) + integer, intent(out) :: a(m, n), b(m, n), c(m) + integer, intent(in) :: n, m + integer :: i + + !$loki loop-fusion group(1) + !$loki loop-interchange + do j=1,m + do i=1,n + a(j, i) = i + j + enddo + end do + + !$loki loop-fusion group(1) + do i=1,n + do j=1,m + a(j, i) = i + j + enddo + end do + + do j=1,m + c(j) = j + enddo + + !$loki loop-fusion group(1) {'insert-loc' if insert_loc else ''} + do i=1,n-1 + do j=1,m + b(j, i) = n-i+1 + j + enddo + end do +end subroutine transform_loop_fuse_ordering +""" + routine = Subroutine.from_source(fcode, frontend=frontend) + assert len(FindNodes(Loop).visit(routine.body)) == 7 + loop_interchange(routine) + loop_fusion(routine) + loops = FindNodes(Loop).visit(routine.body) + assert len(loops) == 5 + loop_0_vars = [var.name.lower() for var in FindVariables().visit(loops[0].body)] + if insert_loc: + assert loops[0].variable.name.lower() == 'j' + assert 'c' in loop_0_vars + else: + assert loops[0].variable.name.lower() == 'i' + assert 'c' not in loop_0_vars + @pytest.mark.parametrize('frontend', available_frontends()) def test_transform_loop_fuse_matching(tmp_path, frontend): """ @@ -1631,7 +1684,7 @@ def test_transform_loop_unroll(tmp_path, frontend): # Test the reference solution s = np.zeros(1) function(s=s) - assert s == sum([x + 1 for x in range(1, 11)]) + assert s == sum(x + 1 for x in range(1, 11)) # Apply transformation assert len(FindNodes(Loop).visit(routine.body)) == 1 @@ -1644,7 +1697,7 @@ def test_transform_loop_unroll(tmp_path, frontend): # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == sum([x + 1 for x in range(1, 11)]) + assert s == sum(x + 1 for x in range(1, 11)) clean_test(filepath) clean_test(unrolled_filepath) @@ -1673,7 +1726,7 @@ def test_transform_loop_unroll_step(tmp_path, frontend): # Test the reference solution s = np.zeros(1) function(s=s) - assert s == sum([x + 1 for x in range(1, 11, 2)]) + 
assert s == sum(x + 1 for x in range(1, 11, 2)) # Apply transformation assert len(FindNodes(Loop).visit(routine.body)) == 1 @@ -1686,7 +1739,7 @@ def test_transform_loop_unroll_step(tmp_path, frontend): # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == sum([x + 1 for x in range(1, 11, 2)]) + assert s == sum(x + 1 for x in range(1, 11, 2)) clean_test(filepath) clean_test(unrolled_filepath) @@ -1717,7 +1770,7 @@ def test_transform_loop_unroll_non_literal_range(tmp_path, frontend): # Test the reference solution s = np.zeros(1) function(s=s) - assert s == sum([x + 1 for x in range(1, 11)]) + assert s == sum(x + 1 for x in range(1, 11)) # Apply transformation assert len(FindNodes(Loop).visit(routine.body)) == 1 @@ -1730,7 +1783,7 @@ def test_transform_loop_unroll_non_literal_range(tmp_path, frontend): # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == sum([x + 1 for x in range(1, 11)]) + assert s == sum(x + 1 for x in range(1, 11)) clean_test(filepath) clean_test(unrolled_filepath) @@ -1762,7 +1815,7 @@ def test_transform_loop_unroll_nested(tmp_path, frontend): # Test the reference solution s = np.zeros(1) function(s=s) - assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) # Apply transformation assert len(FindNodes(Loop).visit(routine.body)) == 2 @@ -1775,7 +1828,7 @@ def test_transform_loop_unroll_nested(tmp_path, frontend): # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) clean_test(filepath) clean_test(unrolled_filepath) @@ -1807,7 +1860,7 @@ def test_transform_loop_unroll_nested_restricted_depth(tmp_path, frontend): # Test the reference solution s = np.zeros(1) function(s=s) - assert s == sum([a + b + 1 for 
(a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) # Apply transformation assert len(FindNodes(Loop).visit(routine.body)) == 2 @@ -1820,7 +1873,7 @@ def test_transform_loop_unroll_nested_restricted_depth(tmp_path, frontend): # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) clean_test(filepath) clean_test(unrolled_filepath) @@ -1854,7 +1907,7 @@ def test_transform_loop_unroll_nested_restricted_depth_unrollable(tmp_path, fron # Test the reference solution s = np.zeros(1) function(s=s) - assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) # Apply transformation assert len(FindNodes(Loop).visit(routine.body)) == 2 @@ -1867,7 +1920,7 @@ def test_transform_loop_unroll_nested_restricted_depth_unrollable(tmp_path, fron # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) clean_test(filepath) clean_test(unrolled_filepath) @@ -1915,7 +1968,7 @@ def test_transform_loop_unroll_nested_counters(tmp_path, frontend): # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 11)) if b <= a]) + assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 11)) if b <= a) clean_test(filepath) clean_test(unrolled_filepath) @@ -1953,7 +2006,7 @@ def test_transform_loop_unroll_nested_neighbours(tmp_path, frontend): # Test the reference solution s = np.zeros(1) function(s=s) - assert s == 2 * sum([a + 
b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == 2 * sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) # Apply transformation assert len(FindNodes(Loop).visit(routine.body)) == 3 loop_unroll(routine) @@ -1965,7 +2018,7 @@ def test_transform_loop_unroll_nested_neighbours(tmp_path, frontend): # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == 2 * sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == 2 * sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) clean_test(filepath) clean_test(unrolled_filepath) diff --git a/loki/transformations/transform_loop.py b/loki/transformations/transform_loop.py index a6f788522..29299b4eb 100644 --- a/loki/transformations/transform_loop.py +++ b/loki/transformations/transform_loop.py @@ -294,140 +294,147 @@ def loop_fusion(routine): group = parameters.get('group', 'default') fusion_groups[group] += [(loop, parameters)] - if not fusion_groups: - return - - # Merge loops in each group and put them in the position of the group's first loop - for group, loop_parameter_lists in fusion_groups.items(): - loop_list, parameters = zip(*loop_parameter_lists) - - # First, determine the collapse depth and extract user-annotated loop ranges from pragmas - collapse = [param.get('collapse', None) for param in parameters] - if collapse != [collapse[0]] * len(collapse): - raise RuntimeError(f'Conflicting collapse values in group "{group}"') - collapse = int(collapse[0]) if collapse[0] is not None else 1 - - pragma_ranges = [pragma_ranges_to_loop_ranges(param, routine) for param in parameters] - - # If we have a pragma somewhere with an explicit loop range, we use that for the fused loop - range_set = {r for r in pragma_ranges if r is not None} - if len(range_set) not in (0, 1): - raise RuntimeError(f'Pragma-specified loop ranges in group "{group}" do not match') - - fusion_ranges = None - if range_set: - 
fusion_ranges = range_set.pop() - - # Next, extract loop ranges for all loops in group and convert to iteration space - # polyhedrons for easier alignment - loop_variables, loop_ranges, loop_bodies = \ - zip(*[get_loop_components(get_nested_loops(loop, collapse)) for loop in loop_list]) - iteration_spaces = [Polyhedron.from_loop_ranges(variables, ranges) - for variables, ranges in zip(loop_variables, loop_ranges)] - - # Find the fused iteration space (if not given by a pragma) - if fusion_ranges is None: - fusion_ranges = [] - for level in range(collapse): - lower_bounds, upper_bounds = [], [] - ignored_variables = list(range(level+1, collapse)) - - for p in iteration_spaces: - for bound in p.lower_bounds(level, ignored_variables): - # Decide if we learn something new from this bound, which could be because: - # (1) we don't have any bounds, yet - # (2) bound is smaller than existing lower bounds (i.e. diff < 0) - # (3) bound is not constant and none of the existing bounds are lower (i.e. diff >= 0) - diff = [simplify(bound - b) for b in lower_bounds] - is_any_negative = any(is_constant(d) and symbolic_op(d, op.lt, 0) for d in diff) - is_any_not_negative = any(is_constant(d) and symbolic_op(d, op.ge, 0) for d in diff) - is_new_bound = (not lower_bounds or is_any_negative or - (not is_constant(bound) and not is_any_not_negative)) - if is_new_bound: - # Remove any lower bounds made redundant by bound: - lower_bounds = [b for b, d in zip(lower_bounds, diff) - if not (is_constant(d) and symbolic_op(d, op.lt, 0))] - lower_bounds += [bound] - - for bound in p.upper_bounds(level, ignored_variables): - # Decide if we learn something new from this bound, which could be because: - # (1) we don't have any bounds, yet - # (2) bound is larger than existing upper bounds (i.e. diff > 0) - # (3) bound is not constant and none of the existing bounds are larger (i.e. 
diff <= 0) - diff = [simplify(bound - b) for b in upper_bounds] - is_any_positive = any(is_constant(d) and symbolic_op(d, op.gt, 0) for d in diff) - is_any_not_positive = any(is_constant(d) and symbolic_op(d, op.le, 0) for d in diff) - is_new_bound = (not upper_bounds or is_any_positive or - (not is_constant(bound) and not is_any_not_positive)) - if is_new_bound: - # Remove any lower bounds made redundant by bound: - upper_bounds = [b for b, d in zip(upper_bounds, diff) - if not (is_constant(d) and symbolic_op(d, op.gt, 0))] - upper_bounds += [bound] - - if len(lower_bounds) == 1: - lower_bounds = lower_bounds[0] - else: - fct_symbol = sym.ProcedureSymbol('min', scope=routine) - lower_bounds = sym.InlineCall(fct_symbol, parameters=as_tuple(lower_bounds)) - - if len(upper_bounds) == 1: - upper_bounds = upper_bounds[0] - else: - fct_symbol = sym.ProcedureSymbol('max', scope=routine) - upper_bounds = sym.InlineCall(fct_symbol, parameters=as_tuple(upper_bounds)) - - fusion_ranges += [sym.LoopRange((lower_bounds, upper_bounds))] - - # Align loop ranges and collect bodies - fusion_bodies = [] - fusion_variables = loop_variables[0] - for idx, (variables, ranges, bodies, p) in enumerate( - zip(loop_variables, loop_ranges, loop_bodies, iteration_spaces)): - # TODO: This throws away anything that is not in the inner-most loop body. - body = flatten([Comment(f'! Loki loop-fusion - body {idx} begin'), - bodies[-1], - Comment(f'! 
Loki loop-fusion - body {idx} end')]) - - # Replace loop variables if necessary - var_map = {} - for loop_variable, fusion_variable in zip(variables, fusion_variables): - if loop_variable != fusion_variable: - var_map.update({var: fusion_variable for var in FindVariables().visit(body) - if var.name.lower() == loop_variable.name}) - if var_map: - body = SubstituteExpressions(var_map).visit(body) - - # Wrap in conditional if loop bounds are different - conditions = [] - for loop_range, fusion_range, variable in zip(ranges, fusion_ranges, fusion_variables): - if symbolic_op(loop_range.start, op.ne, fusion_range.start): - conditions += [sym.Comparison(variable, '>=', loop_range.start)] - if symbolic_op(loop_range.stop, op.ne, fusion_range.stop): - conditions += [sym.Comparison(variable, '<=', loop_range.stop)] - if conditions: - if len(conditions) == 1: - condition = conditions[0] - else: - condition = sym.LogicalAnd(as_tuple(conditions)) - body = Conditional(condition=condition, body=as_tuple(body), else_body=()) + if not fusion_groups: + return + + # Merge loops in each group and put them in the position of the group's first loop + # UNLESS 'insert-loc' location is specified for at least one of the group's fusion + # pragmas, in which case the position is the first occurrence of 'insert-loc' for each group + for group, loop_parameter_lists in fusion_groups.items(): + loop_list, parameters = zip(*loop_parameter_lists) + + # First, determine the collapse depth and extract user-annotated loop ranges from pragmas + collapse = [param.get('collapse', None) for param in parameters] + insert_locs = [param.get('insert-loc', False) for param in parameters] + if collapse != [collapse[0]] * len(collapse): + raise RuntimeError(f'Conflicting collapse values in group "{group}"') + collapse = int(collapse[0]) if collapse[0] is not None else 1 + + pragma_ranges = [pragma_ranges_to_loop_ranges(param, routine) for param in parameters] + + # If we have a pragma somewhere with an explicit 
loop range, we use that for the fused loop + range_set = {r for r in pragma_ranges if r is not None} + if len(range_set) not in (0, 1): + raise RuntimeError(f'Pragma-specified loop ranges in group "{group}" do not match') + + fusion_ranges = None + if range_set: + fusion_ranges = range_set.pop() + + # Next, extract loop ranges for all loops in group and convert to iteration space + # polyhedrons for easier alignment + loop_variables, loop_ranges, loop_bodies = \ + zip(*[get_loop_components(get_nested_loops(loop, collapse)) for loop in loop_list]) + iteration_spaces = [Polyhedron.from_loop_ranges(variables, ranges) + for variables, ranges in zip(loop_variables, loop_ranges)] + + # Find the fused iteration space (if not given by a pragma) + if fusion_ranges is None: + fusion_ranges = [] + for level in range(collapse): + lower_bounds, upper_bounds = [], [] + ignored_variables = list(range(level+1, collapse)) + + for p in iteration_spaces: + for bound in p.lower_bounds(level, ignored_variables): + # Decide if we learn something new from this bound, which could be because: + # (1) we don't have any bounds, yet + # (2) bound is smaller than existing lower bounds (i.e. diff < 0) + # (3) bound is not constant and none of the existing bounds are lower (i.e. 
diff >= 0) + diff = [simplify(bound - b) for b in lower_bounds] + is_any_negative = any(is_constant(d) and symbolic_op(d, op.lt, 0) for d in diff) + is_any_not_negative = any(is_constant(d) and symbolic_op(d, op.ge, 0) for d in diff) + is_new_bound = (not lower_bounds or is_any_negative or + (not is_constant(bound) and not is_any_not_negative)) + if is_new_bound: + # Remove any lower bounds made redundant by bound: + lower_bounds = [b for b, d in zip(lower_bounds, diff) + if not (is_constant(d) and symbolic_op(d, op.lt, 0))] + lower_bounds += [bound] + + for bound in p.upper_bounds(level, ignored_variables): + # Decide if we learn something new from this bound, which could be because: + # (1) we don't have any bounds, yet + # (2) bound is larger than existing upper bounds (i.e. diff > 0) + # (3) bound is not constant and none of the existing bounds are larger (i.e. diff <= 0) + diff = [simplify(bound - b) for b in upper_bounds] + is_any_positive = any(is_constant(d) and symbolic_op(d, op.gt, 0) for d in diff) + is_any_not_positive = any(is_constant(d) and symbolic_op(d, op.le, 0) for d in diff) + is_new_bound = (not upper_bounds or is_any_positive or + (not is_constant(bound) and not is_any_not_positive)) + if is_new_bound: + # Remove any upper bounds made redundant by bound: + upper_bounds = [b for b, d in zip(upper_bounds, diff) + if not (is_constant(d) and symbolic_op(d, op.gt, 0))] + upper_bounds += [bound] - fusion_bodies += [body] - - # Create the nested fused loop and replace original loops - fusion_loop = flatten(fusion_bodies) - for fusion_variable, fusion_range in zip(reversed(fusion_variables), reversed(fusion_ranges)): - fusion_loop = Loop(variable=fusion_variable, body=as_tuple(fusion_loop), bounds=fusion_range) + if len(lower_bounds) == 1: + lower_bounds = lower_bounds[0] + else: + # TODO: could/should be ProcedureSymbol, however refer to issue: #390 + fct_symbol = sym.DeferredTypeSymbol(name='min', scope=routine) + lower_bounds = 
sym.InlineCall(fct_symbol, parameters=as_tuple(lower_bounds)) - comment = Comment(f'! Loki loop-fusion group({group})') - loop_map[loop_list[0]] = (comment, fusion_loop) - comment = Comment(f'! Loki loop-fusion group({group}) - loop hoisted') - loop_map.update({loop: comment for loop in loop_list[1:]}) + if len(upper_bounds) == 1: + upper_bounds = upper_bounds[0] + else: + # TODO: could/should be ProcedureSymbol, however refer to issue: #390 + fct_symbol = sym.DeferredTypeSymbol(name='max', scope=routine) + upper_bounds = sym.InlineCall(fct_symbol, parameters=as_tuple(upper_bounds)) - # Apply transformation - routine.body = Transformer(loop_map).visit(routine.body) - info('%s: fused %d loops in %d groups.', routine.name, - sum(len(loop_list) for loop_list in fusion_groups.values()), len(fusion_groups)) + fusion_ranges += [sym.LoopRange((lower_bounds, upper_bounds))] + + # Align loop ranges and collect bodies + fusion_bodies = [] + fusion_variables = loop_variables[0] + for idx, (variables, ranges, bodies, p) in enumerate( + zip(loop_variables, loop_ranges, loop_bodies, iteration_spaces)): + # TODO: This throws away anything that is not in the inner-most loop body. + body = flatten([Comment(f'! Loki loop-fusion - body {idx} begin'), + bodies[-1], + Comment(f'! 
Loki loop-fusion - body {idx} end')]) + + # Replace loop variables if necessary + var_map = {} + for loop_variable, fusion_variable in zip(variables, fusion_variables): + if loop_variable != fusion_variable: + var_map.update({var: fusion_variable for var in FindVariables().visit(body) + if var.name.lower() == loop_variable.name}) + if var_map: + body = SubstituteExpressions(var_map).visit(body) + + # Wrap in conditional if loop bounds are different + conditions = [] + for loop_range, fusion_range, variable in zip(ranges, fusion_ranges, fusion_variables): + if symbolic_op(loop_range.start, op.ne, fusion_range.start): + conditions += [sym.Comparison(variable, '>=', loop_range.start)] + if symbolic_op(loop_range.stop, op.ne, fusion_range.stop): + conditions += [sym.Comparison(variable, '<=', loop_range.stop)] + if conditions: + if len(conditions) == 1: + condition = conditions[0] + else: + condition = sym.LogicalAnd(as_tuple(conditions)) + body = Conditional(condition=condition, body=as_tuple(body), else_body=()) + + fusion_bodies += [body] + + # Create the nested fused loop and replace original loops + fusion_loop = flatten(fusion_bodies) + for fusion_variable, fusion_range in zip(reversed(fusion_variables), reversed(fusion_ranges)): + fusion_loop = Loop(variable=fusion_variable, body=as_tuple(fusion_loop), bounds=fusion_range) + + comment = Comment(f'! Loki loop-fusion group({group})') + insert_loc = insert_locs.index(None) if None in insert_locs else 0 + loop_map[loop_list[insert_loc]] = (comment, Pragma(keyword='loki', + content=f'fused-loop group({group})'), fusion_loop) + comment = Comment(f'! 
Loki loop-fusion group({group}) - loop hoisted') + loop_map.update({loop: comment for i_loop, loop in enumerate(loop_list) if i_loop != insert_loc}) + + # Apply transformation + routine.body = Transformer(loop_map).visit(routine.body) + info('%s: fused %d loops in %d groups.', routine.name, + sum(len(loop_list) for loop_list in fusion_groups.values()), len(fusion_groups)) class FissionTransformer(NestedMaskedTransformer): diff --git a/scripts/loki_transform.py b/scripts/loki_transform.py index f4d3bdccb..e01c68d1e 100644 --- a/scripts/loki_transform.py +++ b/scripts/loki_transform.py @@ -300,7 +300,7 @@ def convert( pipeline = scheduler.config.transformations.get('scc', None) if not pipeline: pipeline = SCCVectorPipeline( - horizontal=horizontal, + horizontal=horizontal, vertical=vertical, block_dim=block_dim, directive=directive, trim_vector_sections=trim_vector_sections ) @@ -310,7 +310,7 @@ def convert( pipeline = scheduler.config.transformations.get('scc-hoist', None) if not pipeline: pipeline = SCCHoistPipeline( - horizontal=horizontal, + horizontal=horizontal, vertical=vertical, block_dim=block_dim, directive=directive, dim_vars=(vertical.size,) if vertical else None, trim_vector_sections=trim_vector_sections @@ -321,7 +321,7 @@ def convert( pipeline = scheduler.config.transformations.get('scc-stack', None) if not pipeline: pipeline = SCCStackPipeline( - horizontal=horizontal, + horizontal=horizontal, vertical=vertical, block_dim=block_dim, directive=directive, check_bounds=False, trim_vector_sections=trim_vector_sections