From 780b91bddd03cab2a58aa041c57ce6b673d2423f Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Tue, 10 Sep 2024 09:58:57 +0000 Subject: [PATCH 01/15] Loop fusion to account for pragmas when loop was fused and allow for some kind of ordering via potentially loking for 'insert' in the fuse pragma --- .../tests/test_transform_loop.py | 55 +++- loki/transformations/transform_loop.py | 268 +++++++++--------- 2 files changed, 191 insertions(+), 132 deletions(-) diff --git a/loki/transformations/tests/test_transform_loop.py b/loki/transformations/tests/test_transform_loop.py index a2209f4c3..dc50866d9 100644 --- a/loki/transformations/tests/test_transform_loop.py +++ b/loki/transformations/tests/test_transform_loop.py @@ -12,7 +12,7 @@ from loki import Subroutine from loki.build import jit_compile, clean_test -from loki.expression import symbols as sym +from loki.expression import symbols as sym, FindVariables from loki.frontend import available_frontends, OMNI from loki.ir import ( is_loki_pragma, pragmas_attached, FindNodes, Loop, Conditional, @@ -215,6 +215,59 @@ def test_transform_loop_interchange_project(tmp_path, frontend): clean_test(interchanged_filepath) +@pytest.mark.parametrize('frontend', available_frontends()) +@pytest.mark.parametrize('insert_loc', (False, True)) +def test_transform_loop_fuse_ordering(frontend, insert_loc): + """ + Apply loop fusion for two loops with matching iteration spaces. + """ + fcode = f""" +subroutine transform_loop_fuse_ordering(a, b, c, n, m) + integer, intent(out) :: a(m, n), b(m, n), c(m) + integer, intent(in) :: n, m + integer :: i + + !$loki loop-fusion group(1) + !$loki loop-interchange + do j=1,m + do i=1,n + a(j, i) = i + j + enddo + end do + + !$loki loop-fusion group(1) + do i=1,n + do j=1,m + a(j, i) = i + j + enddo + end do + + do j=1,m + c(j) = j + enddo + + !$loki loop-fusion group(1) {'insert' if insert_loc else ''} + do i=1,n-1 + do j=1,m + b(j, i) = n-i+1 + j + enddo + end do +end subroutine transform_loop_fuse_ordering +""" + routine = Subroutine.from_source(fcode, frontend=frontend) + assert len(FindNodes(Loop).visit(routine.body)) == 7 + loop_interchange(routine) + loop_fusion(routine) + loops = FindNodes(Loop).visit(routine.body) + assert len(loops) == 5 + loop_0_vars = [var.name.lower() for var in FindVariables().visit(loops[0].body)] + if insert_loc: + assert loops[0].variable.name.lower() == 'j' + assert 'c' in loop_0_vars + else: + assert loops[0].variable.name.lower() == 'i' + assert 'c' not in loop_0_vars + @pytest.mark.parametrize('frontend', available_frontends()) def test_transform_loop_fuse_matching(tmp_path, frontend): """ diff --git a/loki/transformations/transform_loop.py b/loki/transformations/transform_loop.py index a6f788522..8dc47b076 100644 --- a/loki/transformations/transform_loop.py +++ b/loki/transformations/transform_loop.py @@ -294,140 +294,146 @@ def loop_fusion(routine): group = parameters.get('group', 'default') fusion_groups[group] += [(loop, parameters)] - if not fusion_groups: - return - - # Merge loops in each group and put them in the position of the group's first loop - for group, loop_parameter_lists in fusion_groups.items(): - loop_list, parameters = zip(*loop_parameter_lists) - - # First, determine the collapse depth and extract user-annotated loop ranges from pragmas - collapse = [param.get('collapse', None) for param in parameters] - if collapse != [collapse[0]] * len(collapse): - raise RuntimeError(f'Conflicting collapse values in group "{group}"') - collapse = int(collapse[0]) if collapse[0] is not None else 1 - - pragma_ranges = [pragma_ranges_to_loop_ranges(param, routine) for param in parameters] - - # If we have a pragma somewhere with an explicit loop range, we use that for the fused loop - range_set = {r for r in pragma_ranges if r is not None} - if len(range_set) not in (0, 1): - raise RuntimeError(f'Pragma-specified loop ranges in group "{group}" do not match') - - fusion_ranges = None - if range_set: - fusion_ranges = range_set.pop() - - # Next, extract loop ranges for all loops in group and convert to iteration space - # polyhedrons for easier alignment - loop_variables, loop_ranges, loop_bodies = \ - zip(*[get_loop_components(get_nested_loops(loop, collapse)) for loop in loop_list]) - iteration_spaces = [Polyhedron.from_loop_ranges(variables, ranges) - for variables, ranges in zip(loop_variables, loop_ranges)] - - # Find the fused iteration space (if not given by a pragma) - if fusion_ranges is None: - fusion_ranges = [] - for level in range(collapse): - lower_bounds, upper_bounds = [], [] - ignored_variables = list(range(level+1, collapse)) - - for p in iteration_spaces: - for bound in p.lower_bounds(level, ignored_variables): - # Decide if we learn something new from this bound, which could be because: - # (1) we don't have any bounds, yet - # (2) bound is smaller than existing lower bounds (i.e. diff < 0) - # (3) bound is not constant and none of the existing bounds are lower (i.e. diff >= 0) - diff = [simplify(bound - b) for b in lower_bounds] - is_any_negative = any(is_constant(d) and symbolic_op(d, op.lt, 0) for d in diff) - is_any_not_negative = any(is_constant(d) and symbolic_op(d, op.ge, 0) for d in diff) - is_new_bound = (not lower_bounds or is_any_negative or - (not is_constant(bound) and not is_any_not_negative)) - if is_new_bound: - # Remove any lower bounds made redundant by bound: - lower_bounds = [b for b, d in zip(lower_bounds, diff) - if not (is_constant(d) and symbolic_op(d, op.lt, 0))] - lower_bounds += [bound] - - for bound in p.upper_bounds(level, ignored_variables): - # Decide if we learn something new from this bound, which could be because: - # (1) we don't have any bounds, yet - # (2) bound is larger than existing upper bounds (i.e. diff > 0) - # (3) bound is not constant and none of the existing bounds are larger (i.e. diff <= 0) - diff = [simplify(bound - b) for b in upper_bounds] - is_any_positive = any(is_constant(d) and symbolic_op(d, op.gt, 0) for d in diff) - is_any_not_positive = any(is_constant(d) and symbolic_op(d, op.le, 0) for d in diff) - is_new_bound = (not upper_bounds or is_any_positive or - (not is_constant(bound) and not is_any_not_positive)) - if is_new_bound: - # Remove any lower bounds made redundant by bound: - upper_bounds = [b for b, d in zip(upper_bounds, diff) - if not (is_constant(d) and symbolic_op(d, op.gt, 0))] - upper_bounds += [bound] - - if len(lower_bounds) == 1: - lower_bounds = lower_bounds[0] - else: - fct_symbol = sym.ProcedureSymbol('min', scope=routine) - lower_bounds = sym.InlineCall(fct_symbol, parameters=as_tuple(lower_bounds)) - - if len(upper_bounds) == 1: - upper_bounds = upper_bounds[0] - else: - fct_symbol = sym.ProcedureSymbol('max', scope=routine) - upper_bounds = sym.InlineCall(fct_symbol, parameters=as_tuple(upper_bounds)) - - fusion_ranges += [sym.LoopRange((lower_bounds, upper_bounds))] - - # Align loop ranges and collect bodies - fusion_bodies = [] - fusion_variables = loop_variables[0] - for idx, (variables, ranges, bodies, p) in enumerate( - zip(loop_variables, loop_ranges, loop_bodies, iteration_spaces)): - # TODO: This throws away anything that is not in the inner-most loop body. - body = flatten([Comment(f'! Loki loop-fusion - body {idx} begin'), - bodies[-1], - Comment(f'! Loki loop-fusion - body {idx} end')]) - - # Replace loop variables if necessary - var_map = {} - for loop_variable, fusion_variable in zip(variables, fusion_variables): - if loop_variable != fusion_variable: - var_map.update({var: fusion_variable for var in FindVariables().visit(body) - if var.name.lower() == loop_variable.name}) - if var_map: - body = SubstituteExpressions(var_map).visit(body) - - # Wrap in conditional if loop bounds are different - conditions = [] - for loop_range, fusion_range, variable in zip(ranges, fusion_ranges, fusion_variables): - if symbolic_op(loop_range.start, op.ne, fusion_range.start): - conditions += [sym.Comparison(variable, '>=', loop_range.start)] - if symbolic_op(loop_range.stop, op.ne, fusion_range.stop): - conditions += [sym.Comparison(variable, '<=', loop_range.stop)] - if conditions: - if len(conditions) == 1: - condition = conditions[0] - else: - condition = sym.LogicalAnd(as_tuple(conditions)) - body = Conditional(condition=condition, body=as_tuple(body), else_body=()) + if not fusion_groups: + return + + # Merge loops in each group and put them in the position of the group's first loop + # UNLESS 'insert' location is specified for at least one of the group's fusion + # pragmas, in this case the position is the first occurence of 'insert' for each group + for group, loop_parameter_lists in fusion_groups.items(): + loop_list, parameters = zip(*loop_parameter_lists) + + # First, determine the collapse depth and extract user-annotated loop ranges from pragmas + collapse = [param.get('collapse', None) for param in parameters] + insert_locs = [param.get('insert', False) for param in parameters] + print(f"insert_location: {insert_locs}") + if collapse != [collapse[0]] * len(collapse): + raise RuntimeError(f'Conflicting collapse values in group "{group}"') + collapse = int(collapse[0]) if collapse[0] is not None else 1 + + pragma_ranges = [pragma_ranges_to_loop_ranges(param, routine) for param in parameters] + + # If we have a pragma somewhere with an explicit loop range, we use that for the fused loop + range_set = {r for r in pragma_ranges if r is not None} + if len(range_set) not in (0, 1): + raise RuntimeError(f'Pragma-specified loop ranges in group "{group}" do not match') + + fusion_ranges = None + if range_set: + fusion_ranges = range_set.pop() + + # Next, extract loop ranges for all loops in group and convert to iteration space + # polyhedrons for easier alignment + loop_variables, loop_ranges, loop_bodies = \ + zip(*[get_loop_components(get_nested_loops(loop, collapse)) for loop in loop_list]) + iteration_spaces = [Polyhedron.from_loop_ranges(variables, ranges) + for variables, ranges in zip(loop_variables, loop_ranges)] + + # Find the fused iteration space (if not given by a pragma) + if fusion_ranges is None: + fusion_ranges = [] + for level in range(collapse): + lower_bounds, upper_bounds = [], [] + ignored_variables = list(range(level+1, collapse)) + + for p in iteration_spaces: + for bound in p.lower_bounds(level, ignored_variables): + # Decide if we learn something new from this bound, which could be because: + # (1) we don't have any bounds, yet + # (2) bound is smaller than existing lower bounds (i.e. diff < 0) + # (3) bound is not constant and none of the existing bounds are lower (i.e. diff >= 0) + diff = [simplify(bound - b) for b in lower_bounds] + is_any_negative = any(is_constant(d) and symbolic_op(d, op.lt, 0) for d in diff) + is_any_not_negative = any(is_constant(d) and symbolic_op(d, op.ge, 0) for d in diff) + is_new_bound = (not lower_bounds or is_any_negative or + (not is_constant(bound) and not is_any_not_negative)) + if is_new_bound: + # Remove any lower bounds made redundant by bound: + lower_bounds = [b for b, d in zip(lower_bounds, diff) + if not (is_constant(d) and symbolic_op(d, op.lt, 0))] + lower_bounds += [bound] + + for bound in p.upper_bounds(level, ignored_variables): + # Decide if we learn something new from this bound, which could be because: + # (1) we don't have any bounds, yet + # (2) bound is larger than existing upper bounds (i.e. diff > 0) + # (3) bound is not constant and none of the existing bounds are larger (i.e. diff <= 0) + diff = [simplify(bound - b) for b in upper_bounds] + is_any_positive = any(is_constant(d) and symbolic_op(d, op.gt, 0) for d in diff) + is_any_not_positive = any(is_constant(d) and symbolic_op(d, op.le, 0) for d in diff) + is_new_bound = (not upper_bounds or is_any_positive or + (not is_constant(bound) and not is_any_not_positive)) + if is_new_bound: + # Remove any lower bounds made redundant by bound: + upper_bounds = [b for b, d in zip(upper_bounds, diff) + if not (is_constant(d) and symbolic_op(d, op.gt, 0))] + upper_bounds += [bound] - fusion_bodies += [body] - - # Create the nested fused loop and replace original loops - fusion_loop = flatten(fusion_bodies) - for fusion_variable, fusion_range in zip(reversed(fusion_variables), reversed(fusion_ranges)): - fusion_loop = Loop(variable=fusion_variable, body=as_tuple(fusion_loop), bounds=fusion_range) + if len(lower_bounds) == 1: + lower_bounds = lower_bounds[0] + else: + fct_symbol = sym.DeferredTypeSymbol(name='min', scope=routine) + lower_bounds = sym.InlineCall(fct_symbol, parameters=as_tuple(lower_bounds)) - comment = Comment(f'! Loki loop-fusion group({group})') - loop_map[loop_list[0]] = (comment, fusion_loop) - comment = Comment(f'! Loki loop-fusion group({group}) - loop hoisted') - loop_map.update({loop: comment for loop in loop_list[1:]}) + if len(upper_bounds) == 1: + upper_bounds = upper_bounds[0] + else: + fct_symbol = sym.DeferredTypeSymbol(name='max', scope=routine) + upper_bounds = sym.InlineCall(fct_symbol, parameters=as_tuple(upper_bounds)) - # Apply transformation - routine.body = Transformer(loop_map).visit(routine.body) - info('%s: fused %d loops in %d groups.', routine.name, - sum(len(loop_list) for loop_list in fusion_groups.values()), len(fusion_groups)) + fusion_ranges += [sym.LoopRange((lower_bounds, upper_bounds))] + + # Align loop ranges and collect bodies + fusion_bodies = [] + fusion_variables = loop_variables[0] + for idx, (variables, ranges, bodies, p) in enumerate( + zip(loop_variables, loop_ranges, loop_bodies, iteration_spaces)): + # TODO: This throws away anything that is not in the inner-most loop body. + body = flatten([Comment(f'! Loki loop-fusion - body {idx} begin'), + bodies[-1], + Comment(f'! Loki loop-fusion - body {idx} end')]) + + # Replace loop variables if necessary + var_map = {} + for loop_variable, fusion_variable in zip(variables, fusion_variables): + if loop_variable != fusion_variable: + var_map.update({var: fusion_variable for var in FindVariables().visit(body) + if var.name.lower() == loop_variable.name}) + if var_map: + body = SubstituteExpressions(var_map).visit(body) + + # Wrap in conditional if loop bounds are different + conditions = [] + for loop_range, fusion_range, variable in zip(ranges, fusion_ranges, fusion_variables): + if symbolic_op(loop_range.start, op.ne, fusion_range.start): + conditions += [sym.Comparison(variable, '>=', loop_range.start)] + if symbolic_op(loop_range.stop, op.ne, fusion_range.stop): + conditions += [sym.Comparison(variable, '<=', loop_range.stop)] + if conditions: + if len(conditions) == 1: + condition = conditions[0] + else: + condition = sym.LogicalAnd(as_tuple(conditions)) + body = Conditional(condition=condition, body=as_tuple(body), else_body=()) + + fusion_bodies += [body] + + # Create the nested fused loop and replace original loops + fusion_loop = flatten(fusion_bodies) + for fusion_variable, fusion_range in zip(reversed(fusion_variables), reversed(fusion_ranges)): + fusion_loop = Loop(variable=fusion_variable, body=as_tuple(fusion_loop), bounds=fusion_range) + + comment = Comment(f'! Loki loop-fusion group({group})') + insert_loc = insert_locs.index(None) if None in insert_locs else 0 + loop_map[loop_list[insert_loc]] = (comment, Pragma(keyword='loki', + content=f'fused-loop group({group})'), fusion_loop) + comment = Comment(f'! Loki loop-fusion group({group}) - loop hoisted') + loop_map.update({loop: comment for i_loop, loop in enumerate(loop_list) if i_loop != insert_loc}) + + # Apply transformation + routine.body = Transformer(loop_map).visit(routine.body) + info('%s: fused %d loops in %d groups.', routine.name, + sum(len(loop_list) for loop_list in fusion_groups.values()), len(fusion_groups)) class FissionTransformer(NestedMaskedTransformer): From 4ac235a1dbd0563a2f34213a52fb7ab8ace6c6f0 Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Tue, 10 Sep 2024 10:00:27 +0000 Subject: [PATCH 02/15] Introduce new transformation 'SCCFuseVerticalLoops' transformation, that fuses loops and demotes temporaries in the vertical dimension --- .../transformations/single_column/__init__.py | 1 + .../single_column/tests/test_scc_vertical.py | 249 ++++++++++++++++++ .../transformations/single_column/vertical.py | 224 ++++++++++++++++ 3 files changed, 474 insertions(+) create mode 100644 loki/transformations/single_column/tests/test_scc_vertical.py create mode 100644 loki/transformations/single_column/vertical.py diff --git a/loki/transformations/single_column/__init__.py b/loki/transformations/single_column/__init__.py index d5fe934d1..2ed513c63 100644 --- a/loki/transformations/single_column/__init__.py +++ b/loki/transformations/single_column/__init__.py @@ -13,3 +13,4 @@ from loki.transformations.single_column.scc_cuf import * # noqa from loki.transformations.single_column.vector import * # noqa from loki.transformations.single_column.scc_low_level import * # noqa +from loki.transformations.single_column.vertical import * # noqa diff --git a/loki/transformations/single_column/tests/test_scc_vertical.py b/loki/transformations/single_column/tests/test_scc_vertical.py new file mode 100644 index 000000000..bdd6dbbd2 --- /dev/null +++ b/loki/transformations/single_column/tests/test_scc_vertical.py @@ -0,0 +1,249 @@ +# (C) Copyright 2018- ECMWF. +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +import pytest + +from loki import Subroutine, Dimension +from loki.frontend import available_frontends +from loki.ir import FindNodes, Loop +from loki.expression import FindVariables +from loki.transformations.single_column import SCCFuseVerticalLoops + + +@pytest.fixture(scope='module', name='horizontal') +def fixture_horizontal(): + return Dimension( + name='horizontal', size='nlon', index='jl', + bounds=('start', 'end'), aliases=('nproma',) + ) + +@pytest.fixture(scope='module', name='horizontal_bounds_aliases') +def fixture_horizontal_bounds_aliases(): + return Dimension( + name='horizontal_bounds_aliases', size='nlon', index='jl', + bounds=('start', 'end'), aliases=('nproma',), + bounds_aliases=('bnds%start', 'bnds%end') + ) + +@pytest.fixture(scope='module', name='vertical') +def fixture_vertical(): + return Dimension(name='vertical', size='nz', index='jk', aliases=('nlev',)) + +@pytest.fixture(scope='module', name='blocking') +def fixture_blocking(): + return Dimension(name='blocking', size='nb', index='b') + + +@pytest.mark.parametrize('frontend', available_frontends()) +def test_simple_scc_fuse_verticals_transformation(frontend, horizontal, vertical): + """ + Test simple example of vertical loop fusion and demotion of temporaries. + """ + + fcode_kernel = """ + SUBROUTINE compute_column(start, end, nlon, nz, q, t) + INTEGER, INTENT(IN) :: start, end ! Iteration indices + INTEGER, INTENT(IN) :: nlon, nz ! Size of the horizontal and vertical + REAL, INTENT(INOUT) :: t(nlon,nz) + REAL, INTENT(INOUT) :: q(nlon,nz) + REAL :: temp_t(nlon, nz) + REAL :: temp_q(nlon, nz) + INTEGER :: jl, jk + REAL :: c + + c = 5.345 + !$loki loop-fusion group(1) + DO jk = 1, nz + DO jl = start, end + temp_t(jl, jk) = c + temp_q(jl, jk) = c + END DO + END DO + + !$loki loop-fusion group(1) + DO jk = 2, nz + DO jl = start, end + t(jl, jk) = temp_t(jl, jk) * jk + q(jl, jk) = q(jl, jk-1) + t(jl, jk) * temp_q(jl, jk) + END DO + END DO + + ! The scaling is purposefully upper-cased + DO JL = START, END + Q(JL, NZ) = Q(JL, NZ) * C + END DO + END SUBROUTINE compute_column +""" + kernel = Subroutine.from_source(fcode_kernel, frontend=frontend) + + # Ensure we have three loops in the kernel prior to transformation + kernel_loops = FindNodes(Loop).visit(kernel.body) + assert len(kernel_loops) == 5 + + SCCFuseVerticalLoops(vertical=vertical).apply(kernel, role='kernel') + + # Ensure the two vertical loops are fused + kernel_loops = FindNodes(Loop).visit(kernel.body) + assert len(kernel_loops) == 4 + assert kernel_loops[0].variable.name.lower() == 'jk' + assert kernel_loops[-1].variable.name.lower() == 'jl' + assert len([loop for loop in kernel_loops if loop.variable.name.lower() == 'jk']) == 1 + kernel_var_map = kernel.variable_map + assert kernel_var_map['temp_t'].shape == (horizontal.size,) + assert kernel_var_map['temp_q'].shape == (horizontal.size,) + kernel_vars = [var for var in FindVariables().visit(kernel.body) if var.name.lower() in ['temp_t', 'temp_q']] + for var in kernel_vars: + assert var.shape == (horizontal.size,) + assert var.dimensions == (horizontal.index,) + + +@pytest.mark.parametrize('frontend', available_frontends()) +@pytest.mark.parametrize('ignore', (False, True)) +def test_scc_fuse_verticals_transformation(frontend, horizontal, vertical, ignore): + """ + Test somewhat more sophisticated example of vertical loop fusion + and demotion of temporaries. + """ + + fcode_kernel = f""" + SUBROUTINE compute_column(start, end, nlon, nz, q, t) + INTEGER, INTENT(IN) :: start, end ! Iteration indices + INTEGER, INTENT(IN) :: nlon, nz ! Size of the horizontal and vertical + REAL, INTENT(INOUT) :: t(nlon,nz) + REAL, INTENT(INOUT) :: q(nlon,nz) + REAL :: temp_t(nlon, nz) + REAL :: temp_t2(nlon, nz) + REAL :: temp_q(nlon, nz) + REAL :: temp_q2(nlon, nz) + REAL :: temp_cld(nlon, nz, 5) + INTEGER :: jl, jk, jm + REAL :: c + + {'!$loki k-caching ignore(temp_q2)' if ignore else ''} + + c = 5.345 + !$loki loop-fusion group(1-init) + DO jk = 1, nz + DO jl = start, end + temp_t(jl, jk) = c + temp_q(jl, jk) = c + temp_t2(jl, jk) = 2*c + END DO + END DO + + !$loki loop-fusion group(1) + !$loki loop-interchange + DO jm=1,5 + DO jk = 1, nz + DO jl = start, end + temp_cld(jl, jk, jm) = 3.1415 + END DO + END DO + END DO + + DO jl = start, end + q(jl, jk) = 0. + END DO + + !$loki loop-fusion group(1) insert + DO jk = 2, nz + DO jl = start, end + t(jl, jk) = temp_t(jl, jk) * temp_t2(jl, jk-1) * temp_cld(jl, jk, 1) + q(jl, jk) = q(jl, jk-1) + t(jl, jk) * temp_q(jl, jk) + END DO + END DO + + CALL nested_kernel(start, end, nlon, nz, q) + + !$loki loop-fusion group(2) + DO jk = 2, nz + DO jl = start, end + temp_q2(jl, jk) = 3.1415 + END DO + END DO + + !$loki loop-fusion group(2) + DO jk = 2, nz + DO jl = start, end + t(jl, jk) = t(jl, jk) + 3.1415 + q(jl, jk) = q(jl, jk-1) + t(jl, jk) * temp_q(jl, jk) + temp_q2(jl, jk) + END DO + END DO + + ! The scaling is purposefully upper-cased + DO JL = START, END + Q(JL, NZ) = Q(JL, NZ) * C + END DO + END SUBROUTINE compute_column +""" + + + kernel = Subroutine.from_source(fcode_kernel, frontend=frontend) + + # Ensure we have three loops in the kernel prior to transformation + kernel_loops = FindNodes(Loop).visit(kernel.body) + assert len(kernel_loops) == 13 + SCCFuseVerticalLoops(vertical=vertical).apply(kernel, role='kernel') + + # Ensure the two vertical loops are fused + kernel_loops = FindNodes(Loop).visit(kernel.body) + assert len(kernel_loops) == 12 + vertical_loops = [loop for loop in kernel_loops if loop.variable.name.lower() == vertical.index] + assert len(vertical_loops) == 3 + + shape1D = (horizontal.size,) + shape2D = (horizontal.size, vertical.size) + dimension1D = (horizontal.index,) + dimension2D = (horizontal.index,vertical.index) + dimension2DI1 = (horizontal.index, f'{vertical.index}-1') + + vertical_loop_0_vars = FindVariables().visit(vertical_loops[0].body) + vertical_loop_0_var_names = [var.name.lower() for var in vertical_loop_0_vars] + vertical_loop_0_var_dict = dict(zip(vertical_loop_0_var_names, vertical_loop_0_vars)) + assert 'temp_t2' in vertical_loop_0_var_names + assert 'temp_t' not in vertical_loop_0_var_names + assert 'temp_q' not in vertical_loop_0_var_names + assert 'temp_q2' not in vertical_loop_0_var_names + assert 'temp_cld' not in vertical_loop_0_var_names + assert vertical_loop_0_var_dict['temp_t2'].shape == shape2D + assert vertical_loop_0_var_dict['temp_t2'].dimensions == dimension2D + + vertical_loop_1_vars = FindVariables().visit(vertical_loops[1].body) + vertical_loop_1_var_names = [var.name.lower() for var in vertical_loop_1_vars] + vertical_loop_1_var_dict = dict(zip(vertical_loop_1_var_names, vertical_loop_1_vars)) + assert 'temp_t2' in vertical_loop_1_var_names + assert 'temp_t' in vertical_loop_1_var_names + assert 'temp_q' in vertical_loop_1_var_names + assert 'temp_q2' not in vertical_loop_1_vars + assert 'temp_cld' in vertical_loop_1_var_names + assert vertical_loop_1_var_dict['temp_t2'].shape == shape2D + assert vertical_loop_1_var_dict['temp_t2'].dimensions == dimension2DI1 + assert vertical_loop_1_var_dict['temp_t'].shape == shape1D + assert vertical_loop_1_var_dict['temp_t'].dimensions == dimension1D + assert vertical_loop_1_var_dict['temp_q'].shape == shape2D + assert vertical_loop_1_var_dict['temp_q'].dimensions == dimension2D + assert vertical_loop_1_var_dict['temp_cld'].shape == shape1D + (5,) + assert vertical_loop_1_var_dict['temp_cld'].dimensions in (dimension1D + (1,), dimension1D + ('jm',)) + + vertical_loop_2_vars = FindVariables().visit(vertical_loops[2].body) + vertical_loop_2_var_names = [var.name.lower() for var in vertical_loop_2_vars] + vertical_loop_2_var_dict = dict(zip(vertical_loop_2_var_names, vertical_loop_2_vars)) + assert 'temp_t2' not in vertical_loop_2_var_names + assert 'temp_t' not in vertical_loop_2_var_names + assert 'temp_q' in vertical_loop_2_var_names + assert 'temp_q2' in vertical_loop_2_var_names + assert 'temp_cld' not in vertical_loop_2_var_names + assert vertical_loop_2_var_dict['temp_q'].shape == shape2D + assert vertical_loop_2_var_dict['temp_q'].dimensions == dimension2D + assert vertical_loop_2_var_dict['temp_q2'].shape == shape2D if ignore else shape1D + assert vertical_loop_2_var_dict['temp_q2'].dimensions == dimension2D if ignore else dimension1D + + kernel_var_map = kernel.variable_map + assert kernel_var_map['temp_t'].shape == shape1D + assert kernel_var_map['temp_t2'].shape == shape2D + assert kernel_var_map['temp_q'].shape == shape2D + assert kernel_var_map['temp_q2'].shape == shape2D if ignore else shape1D diff --git a/loki/transformations/single_column/vertical.py b/loki/transformations/single_column/vertical.py new file mode 100644 index 000000000..265aadc0a --- /dev/null +++ b/loki/transformations/single_column/vertical.py @@ -0,0 +1,224 @@ +# (C) Copyright 2018- ECMWF. +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +from collections import defaultdict + +from loki.batch import Transformation +from loki.expression import ( + symbols as sym, FindVariables, +) +from loki.ir import ( + nodes as ir, FindNodes, Transformer, + is_loki_pragma, pragmas_attached, + get_pragma_parameters +) +from loki.tools import as_tuple +from loki.transformations.transform_loop import loop_fusion, loop_interchange +from loki.transformations.array_indexing import demote_variables + +__all__ = ['SCCFuseVerticalLoops'] + +class SCCFuseVerticalLoops(Transformation): + """ + A transformation to fuse vertical loops and demote temporaries in the vertical + dimension if possible. + + .. note:: + This transfomation currently relies on pragmas being inserted in the input + source files. + + Parameters + ---------- + vertical : :any:`Dimension` + :any:`Dimension` object describing the variable conventions used in code + to define the vertical data dimension and iteration space. + """ + + def __init__(self, vertical=None): + self.vertical = vertical + + def transform_subroutine(self, routine, **kwargs): + """ + Fuse vertical loops and demote temporaries in the vertical dimension + if possible. + + Parameters + ---------- + routine : :any:`Subroutine` + The subroutine in the vertical loops should be fused and + temporaries be demoted. + horizontal : :any:`Dimension` + The dimension specifying the horizontal vector dimension + """ + role = kwargs['role'] + if role == 'kernel': + self.process_kernel(routine) + + def process_kernel(self, routine): + """ + Current logic (simplified): + + 1. loop interchange to expose vertical loops + 2. fuse vertical loops (possibly into multiple groups) + 3. find local arrays to be demoted and apply heuristics to check whether this is safe + 4. demote those arrays which are safe to be demoted + """ + # find local arrays with a vertical dimension + relevant_local_arrays = self.find_relevant_local_arrays(routine) + # find "multilevel" thus "jk +/- 1" arrays + multilevel_relevant_local_arrays = self.identify_multilevel_arrays(relevant_local_arrays) + # loop interchange to expose vertical loops as outermost loops + loop_interchange(routine) + # handle initialization of arrays "jk +/- 1" arrays + multilevel_relevant_local_arrays_names = set(arr.name.lower() for arr in multilevel_relevant_local_arrays) + self.correct_init_of_multilevel_arrays(routine, multilevel_relevant_local_arrays_names) + # fuse vertical loops + loop_fusion(routine) + # demote in vertical dimension if possible + relevant_local_arrays_names = set(arr.name.lower() for arr in relevant_local_arrays) + demote_candidates = relevant_local_arrays_names - multilevel_relevant_local_arrays_names + # check which variables are safe to demote in the vertical + safe_to_demote = self.check_safe_to_demote(routine, demote_candidates) + # demote locals in vertical dimension + dimensions_to_demote = self.vertical.size_expressions + (f"{self.vertical.size}+1",) + demote_variables(routine, safe_to_demote, dimensions_to_demote) + + def check_safe_to_demote(self, routine, demote_candidates): + """ + Check whether variables that are candidates to be demoted in the vertical dimension are really + safe to be demoted. + + Current heuristic: If the candidate is used in more than one vertical loop, assume it is NOT safe + to demote! + """ + fusion_groups = defaultdict(list) + loop_var_map = {} + with pragmas_attached(routine, ir.Loop): + # Extract all annotated loops and sort them into fusion groups + for loop in FindNodes(ir.Loop).visit(routine.body): + if is_loki_pragma(loop.pragma, starts_with='fused-loop'): + parameters = get_pragma_parameters(loop.pragma, starts_with='fused-loop') + group = parameters.get('group', 'default') + fusion_groups[group] += [(loop, parameters)] + else: + if loop.variable.name.lower() == self.vertical.index.lower(): + fusion_groups['no-group'] += [(loop, None)] + if not fusion_groups: + return demote_candidates + for group, loop_parameter_lists in fusion_groups.items(): + loop_list, parameters = zip(*loop_parameter_lists) + loop_var_map[group] = () + for loop in loop_list: + loop_var_map[group] += as_tuple(var.name.lower() for var in FindVariables().visit(loop.body) + if isinstance(var, sym.Array)) + safe_to_demote = () + for var in demote_candidates: + count = 0 + for group, var_names in loop_var_map.items(): + if group == 'ignore': + continue + if var in var_names: + count += 1 + if count <= 1: + safe_to_demote += (var,) + + return safe_to_demote + + def find_relevant_local_arrays(self, routine): + """ + Find local arrays/temporaries that do have the vertical dimension. + """ + # local/temporary arrays + arrays = [var for var in FindVariables(unique=False).visit(routine.body) if isinstance(var, sym.Array)] + argument_names = [arg.name.lower() for arg in routine.arguments] + local_arrays = [arr for arr in arrays if arr.name.lower() not in argument_names] + # only those with the vertical size within shape + relevant_local_arrays = [arr for arr in local_arrays if self.vertical.size.lower() + in [var.name.lower() for var in FindVariables().visit(arr.shape)]] + # filter arrays to be ignored (for whatever reason) + ignore_names = self.find_local_arrays_to_be_ignored(routine) + if ignore_names: + relevant_local_arrays = [arr for arr in relevant_local_arrays if arr.name.lower() not in ignore_names] + return relevant_local_arrays + + def find_local_arrays_to_be_ignored(self, routine): + """ + Identify variables to be ignore regarding demotion for whatever reason. + + Reasons are: + + * explicitly marked to be ignored via pragmas within the input source file, e.g., + 'loki k-caching ignore(var1, var2, ...)' + """ + ignore = () + pragmas = FindNodes(ir.Pragma).visit(routine.body) + # look for 'loki k-caching ignore(var1, var2, ...)' pragmas within routine and ignore those vars + for pragma in pragmas: + if is_loki_pragma(pragma, starts_with='k-caching'): + pragma_ignore = get_pragma_parameters(pragma, starts_with='k-caching').get('ignore', None) + if pragma_ignore: + ignore += as_tuple(v.strip() for v in pragma_ignore.split(',')) + ignore_names = set(var.lower() for var in ignore) + return ignore_names + + def identify_multilevel_arrays(self, local_arrays): + """ + Identify local arrays/temporaries that have an access in the vertical dimension + that is different to '', e.g., ' +/- 1' + """ + multilevel_local_arrays = [] + for arr in local_arrays: + for dim in arr.dimensions: + if self.vertical.index in FindVariables().visit(dim): + # dim is not equal to vertical.index e.g., vertical.index +/- 1 + if dim != self.vertical.index: + multilevel_local_arrays.append(arr) + return multilevel_local_arrays + + def correct_init_of_multilevel_arrays(self, routine, multilevel_local_arrays): + """ + Possibly handle initaliztion of those multilevel local arrays via + splitting relevant loops or rather creating a new node with the relevant + nodes moved to the newly created loop. + + .. note:: + This relies on pragmas being inserted in the input source code! + """ + loop_map = {} + # find/identify loops with pragma 'loop-fusion group(-init)' + with pragmas_attached(routine, ir.Loop): + loop_map = {} + for loop in FindNodes(ir.Loop).visit(routine.body): + if is_loki_pragma(loop.pragma, starts_with='loop-fusion'): + parameters = get_pragma_parameters(loop.pragma, starts_with='loop-fusion') + group = parameters.get('group', 'default') + if 'init' in group: + nodes_to_be_moved = () + nodes = FindNodes(ir.Assignment).visit(loop.body) + node_map = {} + node_map_init = {} + # find nodes that have multilevel arrays + for node in nodes: + node_vars = FindVariables().visit(node) + if any(node_var.name.lower() in multilevel_local_arrays for node_var in node_vars): + nodes_to_be_moved += (node,) + node_map[node] = None + else: + node_map_init[node] = None + # split the loop/create a new node to move those nodes with + # multilevel arrays to the new node + if nodes_to_be_moved: + pragmas = loop.pragma + new_pragmas = [pragma.clone(content=pragma.content.replace('-init', '')) if '-init' + in pragma.content else pragma for pragma in pragmas] + loop_map[loop] = (ir.Comment('! Loki generated loop for init ...'), + Transformer(node_map_init).visit(loop.clone(\ + pragma=as_tuple(ir.Pragma(keyword='loki', + content='fused-loop group(ignore)')))), + Transformer(node_map).visit(loop.clone(pragma=as_tuple(new_pragmas)))) + if loop_map: + routine.body = Transformer(loop_map).visit(routine.body) From 94d7668e3896342cf8d8e8e9e6cb6963d563f6a5 Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Tue, 10 Sep 2024 10:01:27 +0000 Subject: [PATCH 03/15] Include 'SCCFuseVerticalLoops' in SCC pipelines --- loki/transformations/single_column/scc.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/loki/transformations/single_column/scc.py b/loki/transformations/single_column/scc.py index 3b7f7d7dc..001ddca14 100644 --- a/loki/transformations/single_column/scc.py +++ b/loki/transformations/single_column/scc.py @@ -19,7 +19,7 @@ from loki.transformations.single_column.vector import ( SCCDevectorTransformation, SCCDemoteTransformation, SCCRevectorTransformation ) - +from loki.transformations.single_column.vertical import SCCFuseVerticalLoops __all__ = [ 'SCCVectorPipeline', 'SCCHoistPipeline', 'SCCStackPipeline', 'SCCRawStackPipeline' @@ -75,6 +75,7 @@ """ SCCVectorPipeline = partial( Pipeline, classes=( + SCCFuseVerticalLoops, SCCBaseTransformation, SCCDevectorTransformation, SCCDemoteTransformation, @@ -121,6 +122,7 @@ """ SCCHoistPipeline = partial( Pipeline, classes=( + SCCFuseVerticalLoops, SCCBaseTransformation, SCCDevectorTransformation, SCCDemoteTransformation, @@ -166,6 +168,7 @@ """ SCCStackPipeline = partial( Pipeline, classes=( + SCCFuseVerticalLoops, SCCBaseTransformation, SCCDevectorTransformation, SCCDemoteTransformation, From 12c9f03976ebfe4bf2eacb1e6f0093d3c46ec3f0 Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Tue, 10 Sep 2024 10:02:18 +0000 Subject: [PATCH 04/15] Pass the vertical dimension to SCC pipelines (since this information is now necessary due to 'SCCFuseVerticalLoops') --- scripts/loki_transform.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/loki_transform.py b/scripts/loki_transform.py index f4d3bdccb..e01c68d1e 100644 --- a/scripts/loki_transform.py +++ b/scripts/loki_transform.py @@ -300,7 +300,7 @@ def convert( pipeline = scheduler.config.transformations.get('scc', None) if not pipeline: pipeline = SCCVectorPipeline( - horizontal=horizontal, + horizontal=horizontal, vertical=vertical, block_dim=block_dim, directive=directive, trim_vector_sections=trim_vector_sections ) @@ -310,7 +310,7 @@ def convert( pipeline = scheduler.config.transformations.get('scc-hoist', None) if not pipeline: pipeline = SCCHoistPipeline( - horizontal=horizontal, + horizontal=horizontal, vertical=vertical, block_dim=block_dim, directive=directive, dim_vars=(vertical.size,) if vertical else None, trim_vector_sections=trim_vector_sections @@ -321,7 +321,7 @@ def convert( pipeline = scheduler.config.transformations.get('scc-stack', None) if not pipeline: pipeline = SCCStackPipeline( - horizontal=horizontal, + horizontal=horizontal, vertical=vertical, block_dim=block_dim, directive=directive, check_bounds=False, trim_vector_sections=trim_vector_sections From 72f1164a6ac5f385f53adfbffa396039d2521f56 Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Tue, 10 Sep 2024 13:38:11 +0000 Subject: [PATCH 05/15] [SCCFuseVerticalLoops] bail if vertical dimension is not defined, re-implemented check whether demotion is safe --- .../transformations/single_column/vertical.py | 38 ++++++++----------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/loki/transformations/single_column/vertical.py b/loki/transformations/single_column/vertical.py index 265aadc0a..6b62bfc50 100644 --- a/loki/transformations/single_column/vertical.py +++ b/loki/transformations/single_column/vertical.py @@ -5,8 +5,6 @@ # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. -from collections import defaultdict - from loki.batch import Transformation from loki.expression import ( symbols as sym, FindVariables, @@ -19,6 +17,7 @@ from loki.tools import as_tuple from loki.transformations.transform_loop import loop_fusion, loop_interchange from loki.transformations.array_indexing import demote_variables +from loki.logging import info __all__ = ['SCCFuseVerticalLoops'] @@ -54,6 +53,9 @@ def transform_subroutine(self, routine, **kwargs): horizontal : :any:`Dimension` The dimension specifying the horizontal vector dimension """ + if self.vertical is None: + info('[SCCFuseVerticalLoops] is not applied as the vertical dimension is not defined!') + return role = kwargs['role'] if role == 'kernel': self.process_kernel(routine) @@ -95,32 +97,24 @@ def check_safe_to_demote(self, routine, demote_candidates): Current heuristic: If the candidate is used in more than one vertical loop, assume it is NOT safe to demote! """ - fusion_groups = defaultdict(list) loop_var_map = {} with pragmas_attached(routine, ir.Loop): - # Extract all annotated loops and sort them into fusion groups for loop in FindNodes(ir.Loop).visit(routine.body): - if is_loki_pragma(loop.pragma, starts_with='fused-loop'): - parameters = get_pragma_parameters(loop.pragma, starts_with='fused-loop') - group = parameters.get('group', 'default') - fusion_groups[group] += [(loop, parameters)] - else: - if loop.variable.name.lower() == self.vertical.index.lower(): - fusion_groups['no-group'] += [(loop, None)] - if not fusion_groups: - return demote_candidates - for group, loop_parameter_lists in fusion_groups.items(): - loop_list, parameters = zip(*loop_parameter_lists) - loop_var_map[group] = () - for loop in loop_list: - loop_var_map[group] += as_tuple(var.name.lower() for var in FindVariables().visit(loop.body) - if isinstance(var, sym.Array)) + if loop.variable.name.lower() == self.vertical.index.lower(): + ignore = False + if is_loki_pragma(loop.pragma, starts_with='fused-loop'): + parameters = get_pragma_parameters(loop.pragma, starts_with='fused-loop') + group = parameters.get('group', 'default') + if group == 'ignore': + ignore = True + if not ignore: + loop_var_map[loop] = as_tuple(var.name.lower() for var in FindVariables().visit(loop.body) + if isinstance(var, sym.Array)) + safe_to_demote = () for var in demote_candidates: count = 0 - for group, var_names in loop_var_map.items(): - if group == 'ignore': - continue + for _, var_names in loop_var_map.items(): if var in var_names: count += 1 if count <= 1: From 2cde419a799c921d406c132ff672e404f42397c6 Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Tue, 10 Sep 2024 13:38:39 +0000 Subject: [PATCH 06/15] loop_fusion: remove debug print statement --- loki/transformations/transform_loop.py | 1 - 1 file changed, 1 deletion(-) diff --git a/loki/transformations/transform_loop.py b/loki/transformations/transform_loop.py index 8dc47b076..f19bb4474 100644 --- a/loki/transformations/transform_loop.py +++ b/loki/transformations/transform_loop.py @@ -306,7 +306,6 @@ def loop_fusion(routine): # First, determine the collapse depth and extract user-annotated loop ranges from pragmas collapse = [param.get('collapse', None) for param in parameters] insert_locs = [param.get('insert', False) for param in parameters] - print(f"insert_location: {insert_locs}") if collapse != [collapse[0]] * len(collapse): raise RuntimeError(f'Conflicting collapse values in group "{group}"') collapse = int(collapse[0]) if collapse[0] is not None else 1 From 1c4b44b69f634cc937e6f96180fe28bea012f957 Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Tue, 10 Sep 2024 16:59:19 +0200 Subject: [PATCH 07/15] Scheduler test: account for new trafo in SCCVectorPipeline --- loki/batch/tests/test_scheduler.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/loki/batch/tests/test_scheduler.py b/loki/batch/tests/test_scheduler.py index bd411ca95..b3d42948b 100644 --- a/loki/batch/tests/test_scheduler.py +++ b/loki/batch/tests/test_scheduler.py @@ -2887,21 +2887,22 @@ def test_pipeline_config_compose(config): assert isinstance(pipeline, Pipeline) # Check that the pipeline is correctly composed - assert len(pipeline.transformations) == 7 + assert len(pipeline.transformations) == 8 assert type(pipeline.transformations[0]).__name__ == 'RemoveCodeTransformation' - assert type(pipeline.transformations[1]).__name__ == 'SCCBaseTransformation' - assert type(pipeline.transformations[2]).__name__ == 'SCCDevectorTransformation' - assert type(pipeline.transformations[3]).__name__ == 'SCCDemoteTransformation' - assert type(pipeline.transformations[4]).__name__ == 'SCCRevectorTransformation' - assert type(pipeline.transformations[5]).__name__ == 'SCCAnnotateTransformation' - assert type(pipeline.transformations[6]).__name__ == 'ModuleWrapTransformation' + assert type(pipeline.transformations[1]).__name__ == 'SCCFuseVerticalLoops' + assert type(pipeline.transformations[2]).__name__ == 'SCCBaseTransformation' + assert type(pipeline.transformations[3]).__name__ == 'SCCDevectorTransformation' + assert type(pipeline.transformations[4]).__name__ == 'SCCDemoteTransformation' + assert type(pipeline.transformations[5]).__name__ == 'SCCRevectorTransformation' + assert type(pipeline.transformations[6]).__name__ == 'SCCAnnotateTransformation' + assert type(pipeline.transformations[7]).__name__ == 'ModuleWrapTransformation' # Check for some specified and default constructor flags assert pipeline.transformations[0].call_names == ('dr_hook',) assert pipeline.transformations[0].remove_imports is False - assert isinstance(pipeline.transformations[1].horizontal, Dimension) - assert pipeline.transformations[1].horizontal.size == 'KLON' - assert pipeline.transformations[1].horizontal.index == 'JL' - assert pipeline.transformations[1].directive == 'openacc' - assert pipeline.transformations[2].trim_vector_sections is True - assert pipeline.transformations[6].replace_ignore_items is True + assert isinstance(pipeline.transformations[2].horizontal, Dimension) + assert pipeline.transformations[2].horizontal.size == 'KLON' + assert pipeline.transformations[2].horizontal.index == 'JL' + assert pipeline.transformations[2].directive == 'openacc' + assert pipeline.transformations[3].trim_vector_sections is True + assert pipeline.transformations[7].replace_ignore_items is True From b40ae8a3eb7e115952ef8f272677a0bb1d2604cf Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Tue, 1 Oct 2024 14:41:14 +0000 Subject: [PATCH 08/15] SCCFuseVerticalLoops: minor refactoring and simplifications --- .../single_column/tests/test_scc_vertical.py | 6 +-- .../transformations/single_column/vertical.py | 54 +++++++++---------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/loki/transformations/single_column/tests/test_scc_vertical.py b/loki/transformations/single_column/tests/test_scc_vertical.py index bdd6dbbd2..d3ed091ad 100644 --- a/loki/transformations/single_column/tests/test_scc_vertical.py +++ b/loki/transformations/single_column/tests/test_scc_vertical.py @@ -52,15 +52,15 @@ def test_simple_scc_fuse_verticals_transformation(frontend, horizontal, vertical REAL, INTENT(INOUT) :: q(nlon,nz) REAL :: temp_t(nlon, nz) REAL :: temp_q(nlon, nz) - INTEGER :: jl, jk + INTEGER :: jl, JK REAL :: c c = 5.345 !$loki loop-fusion group(1) - DO jk = 1, nz + DO JK = 1, nz DO jl = start, end temp_t(jl, jk) = c - temp_q(jl, jk) = c + temp_q(jl, JK) = c END DO END DO diff --git a/loki/transformations/single_column/vertical.py b/loki/transformations/single_column/vertical.py index 6b62bfc50..f15efa19c 100644 --- a/loki/transformations/single_column/vertical.py +++ b/loki/transformations/single_column/vertical.py @@ -14,9 +14,10 @@ is_loki_pragma, pragmas_attached, get_pragma_parameters ) -from loki.tools import as_tuple +from loki.tools import as_tuple, CaseInsensitiveDict from loki.transformations.transform_loop import loop_fusion, loop_interchange from loki.transformations.array_indexing import demote_variables +from loki.transformations.utilities import get_local_arrays from loki.logging import info __all__ = ['SCCFuseVerticalLoops'] @@ -28,7 +29,13 @@ class SCCFuseVerticalLoops(Transformation): .. note:: This transfomation currently relies on pragmas being inserted in the input - source files. + source files. Relevant pragmas are `!$loki loop-interchange` to expose the + vertical loops (in case vertical loops are nested) and `!$loki loop-fusion` + possibly grouped via `group()`. Further, if there are loops + that initialize multilevel arrays (`jk +/- 1`) it is possible to mark those + loops as `!$loki loop-fusion group(-init)`. This allows to split + the relevant node and moves the initialization of those arrays to the top of + the group. Parameters ---------- @@ -50,8 +57,6 @@ def transform_subroutine(self, routine, **kwargs): routine : :any:`Subroutine` The subroutine in the vertical loops should be fused and temporaries be demoted. - horizontal : :any:`Dimension` - The dimension specifying the horizontal vector dimension """ if self.vertical is None: info('[SCCFuseVerticalLoops] is not applied as the vertical dimension is not defined!') @@ -97,27 +102,22 @@ def check_safe_to_demote(self, routine, demote_candidates): Current heuristic: If the candidate is used in more than one vertical loop, assume it is NOT safe to demote! """ - loop_var_map = {} + loop_var_map = CaseInsensitiveDict() with pragmas_attached(routine, ir.Loop): for loop in FindNodes(ir.Loop).visit(routine.body): - if loop.variable.name.lower() == self.vertical.index.lower(): - ignore = False + if loop.variable == self.vertical.index: if is_loki_pragma(loop.pragma, starts_with='fused-loop'): parameters = get_pragma_parameters(loop.pragma, starts_with='fused-loop') group = parameters.get('group', 'default') if group == 'ignore': - ignore = True - if not ignore: - loop_var_map[loop] = as_tuple(var.name.lower() for var in FindVariables().visit(loop.body) - if isinstance(var, sym.Array)) + continue + for var in FindVariables().visit(loop.body): + if isinstance(var, sym.Array): + loop_var_map.setdefault(var.name, set()).add(group) safe_to_demote = () for var in demote_candidates: - count = 0 - for _, var_names in loop_var_map.items(): - if var in var_names: - count += 1 - if count <= 1: + if var in loop_var_map and len(loop_var_map[var]) <= 1: safe_to_demote += (var,) return safe_to_demote @@ -127,12 +127,10 @@ def find_relevant_local_arrays(self, routine): Find local arrays/temporaries that do have the vertical dimension. """ # local/temporary arrays - arrays = [var for var in FindVariables(unique=False).visit(routine.body) if isinstance(var, sym.Array)] - argument_names = [arg.name.lower() for arg in routine.arguments] - local_arrays = [arr for arr in arrays if arr.name.lower() not in argument_names] + local_arrays = get_local_arrays(routine, routine.body) # only those with the vertical size within shape relevant_local_arrays = [arr for arr in local_arrays if self.vertical.size.lower() - in [var.name.lower() for var in FindVariables().visit(arr.shape)]] + in FindVariables().visit(arr.shape)] # filter arrays to be ignored (for whatever reason) ignore_names = self.find_local_arrays_to_be_ignored(routine) if ignore_names: @@ -153,8 +151,7 @@ def find_local_arrays_to_be_ignored(self, routine): # look for 'loki k-caching ignore(var1, var2, ...)' pragmas within routine and ignore those vars for pragma in pragmas: if is_loki_pragma(pragma, starts_with='k-caching'): - pragma_ignore = get_pragma_parameters(pragma, starts_with='k-caching').get('ignore', None) - if pragma_ignore: + if pragma_ignore := get_pragma_parameters(pragma, starts_with='k-caching').get('ignore', None): ignore += as_tuple(v.strip() for v in pragma_ignore.split(',')) ignore_names = set(var.lower() for var in ignore) return ignore_names @@ -175,7 +172,7 @@ def identify_multilevel_arrays(self, local_arrays): def correct_init_of_multilevel_arrays(self, routine, multilevel_local_arrays): """ - Possibly handle initaliztion of those multilevel local arrays via + Possibly handle initialization of those multilevel local arrays via splitting relevant loops or rather creating a new node with the relevant nodes moved to the newly created loop. @@ -209,10 +206,13 @@ def correct_init_of_multilevel_arrays(self, routine, multilevel_local_arrays): pragmas = loop.pragma new_pragmas = [pragma.clone(content=pragma.content.replace('-init', '')) if '-init' in pragma.content else pragma for pragma in pragmas] + # init part + transf_init = Transformer(node_map_init).visit(loop.clone(\ + pragma=as_tuple(ir.Pragma(keyword='loki', + content='fused-loop group(ignore)')))) + # rest of the original node/loop + transf_orig = Transformer(node_map).visit(loop.clone(pragma=as_tuple(new_pragmas))) loop_map[loop] = (ir.Comment('! Loki generated loop for init ...'), - Transformer(node_map_init).visit(loop.clone(\ - pragma=as_tuple(ir.Pragma(keyword='loki', - content='fused-loop group(ignore)')))), - Transformer(node_map).visit(loop.clone(pragma=as_tuple(new_pragmas)))) + transf_init, transf_orig) if loop_map: routine.body = Transformer(loop_map).visit(routine.body) From 6538ac2a377f3546b58759b619966fcc7800639c Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Mon, 7 Oct 2024 21:27:22 +0200 Subject: [PATCH 09/15] (vertical) loop fusion: more strongly guard of '-init' ending --- loki/transformations/single_column/vertical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loki/transformations/single_column/vertical.py b/loki/transformations/single_column/vertical.py index f15efa19c..0027ef69f 100644 --- a/loki/transformations/single_column/vertical.py +++ b/loki/transformations/single_column/vertical.py @@ -187,7 +187,7 @@ def correct_init_of_multilevel_arrays(self, routine, multilevel_local_arrays): if is_loki_pragma(loop.pragma, starts_with='loop-fusion'): parameters = get_pragma_parameters(loop.pragma, starts_with='loop-fusion') group = parameters.get('group', 'default') - if 'init' in group: + if group.endswith('-init'): nodes_to_be_moved = () nodes = FindNodes(ir.Assignment).visit(loop.body) node_map = {} From 6b3bac8601692fdc21047cd8fa9348e127332174 Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Mon, 7 Oct 2024 22:00:50 +0200 Subject: [PATCH 10/15] loop fusion: use 'insert-loc' instead of 'insert' --- loki/transformations/tests/test_transform_loop.py | 2 +- loki/transformations/transform_loop.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/loki/transformations/tests/test_transform_loop.py b/loki/transformations/tests/test_transform_loop.py index dc50866d9..4123a5c35 100644 --- a/loki/transformations/tests/test_transform_loop.py +++ b/loki/transformations/tests/test_transform_loop.py @@ -246,7 +246,7 @@ def test_transform_loop_fuse_ordering(frontend, insert_loc): c(j) = j enddo - !$loki loop-fusion group(1) {'insert' if insert_loc else ''} + !$loki loop-fusion group(1) {'insert-loc' if insert_loc else ''} do i=1,n-1 do j=1,m b(j, i) = n-i+1 + j diff --git a/loki/transformations/transform_loop.py b/loki/transformations/transform_loop.py index f19bb4474..462c110f7 100644 --- a/loki/transformations/transform_loop.py +++ b/loki/transformations/transform_loop.py @@ -298,14 +298,14 @@ def loop_fusion(routine): return # Merge loops in each group and put them in the position of the group's first loop - # UNLESS 'insert' location is specified for at least one of the group's fusion - # pragmas, in this case the position is the first occurence of 'insert' for each group + # UNLESS 'insert-loc' location is specified for at least one of the group's fusion + # pragmas, in this case the position is the first occurence of 'insert-loc' for each group for group, loop_parameter_lists in fusion_groups.items(): loop_list, parameters = zip(*loop_parameter_lists) # First, determine the collapse depth and extract user-annotated loop ranges from pragmas collapse = [param.get('collapse', None) for param in parameters] - insert_locs = [param.get('insert', False) for param in parameters] + insert_locs = [param.get('insert-loc', False) for param in parameters] if collapse != [collapse[0]] * len(collapse): raise RuntimeError(f'Conflicting collapse values in group "{group}"') collapse = int(collapse[0]) if collapse[0] is not None else 1 From a9f72870b2cc4dc9ea94edacc8c30d64f8aded3b Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Mon, 7 Oct 2024 22:12:56 +0200 Subject: [PATCH 11/15] please linter --- .../tests/test_transform_loop.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/loki/transformations/tests/test_transform_loop.py b/loki/transformations/tests/test_transform_loop.py index 4123a5c35..73bd671f8 100644 --- a/loki/transformations/tests/test_transform_loop.py +++ b/loki/transformations/tests/test_transform_loop.py @@ -13,7 +13,7 @@ from loki import Subroutine from loki.build import jit_compile, clean_test from loki.expression import symbols as sym, FindVariables -from loki.frontend import available_frontends, OMNI +from loki.frontend import available_frontends from loki.ir import ( is_loki_pragma, pragmas_attached, FindNodes, Loop, Conditional, Assignment @@ -1684,7 +1684,7 @@ def test_transform_loop_unroll(tmp_path, frontend): # Test the reference solution s = np.zeros(1) function(s=s) - assert s == sum([x + 1 for x in range(1, 11)]) + assert s == sum(x + 1 for x in range(1, 11)) # Apply transformation assert len(FindNodes(Loop).visit(routine.body)) == 1 @@ -1697,7 +1697,7 @@ def test_transform_loop_unroll(tmp_path, frontend): # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == sum([x + 1 for x in range(1, 11)]) + assert s == sum(x + 1 for x in range(1, 11)) clean_test(filepath) clean_test(unrolled_filepath) @@ -1726,7 +1726,7 @@ def test_transform_loop_unroll_step(tmp_path, frontend): # Test the reference solution s = np.zeros(1) function(s=s) - assert s == sum([x + 1 for x in range(1, 11, 2)]) + assert s == sum(x + 1 for x in range(1, 11, 2)) # Apply transformation assert len(FindNodes(Loop).visit(routine.body)) == 1 @@ -1739,7 +1739,7 @@ def test_transform_loop_unroll_step(tmp_path, frontend): # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == sum([x + 1 for x in range(1, 11, 2)]) + assert s == sum(x + 1 for x in range(1, 11, 2)) clean_test(filepath) clean_test(unrolled_filepath) @@ -1770,7 +1770,7 @@ def test_transform_loop_unroll_non_literal_range(tmp_path, frontend): # Test the reference solution s = np.zeros(1) function(s=s) - assert s == sum([x + 1 for x in range(1, 11)]) + assert s == sum(x + 1 for x in range(1, 11)) # Apply transformation assert len(FindNodes(Loop).visit(routine.body)) == 1 @@ -1783,7 +1783,7 @@ def test_transform_loop_unroll_non_literal_range(tmp_path, frontend): # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == sum([x + 1 for x in range(1, 11)]) + assert s == sum(x + 1 for x in range(1, 11)) clean_test(filepath) clean_test(unrolled_filepath) @@ -1815,7 +1815,7 @@ def test_transform_loop_unroll_nested(tmp_path, frontend): # Test the reference solution s = np.zeros(1) function(s=s) - assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) # Apply transformation assert len(FindNodes(Loop).visit(routine.body)) == 2 @@ -1828,7 +1828,7 @@ def test_transform_loop_unroll_nested(tmp_path, frontend): # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) clean_test(filepath) clean_test(unrolled_filepath) @@ -1860,7 +1860,7 @@ def test_transform_loop_unroll_nested_restricted_depth(tmp_path, frontend): # Test the reference solution s = np.zeros(1) function(s=s) - assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) # Apply transformation assert len(FindNodes(Loop).visit(routine.body)) == 2 @@ -1873,7 +1873,7 @@ def test_transform_loop_unroll_nested_restricted_depth(tmp_path, frontend): # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) clean_test(filepath) clean_test(unrolled_filepath) @@ -1907,7 +1907,7 @@ def test_transform_loop_unroll_nested_restricted_depth_unrollable(tmp_path, fron # Test the reference solution s = np.zeros(1) function(s=s) - assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) # Apply transformation assert len(FindNodes(Loop).visit(routine.body)) == 2 @@ -1920,7 +1920,7 @@ def test_transform_loop_unroll_nested_restricted_depth_unrollable(tmp_path, fron # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) clean_test(filepath) clean_test(unrolled_filepath) @@ -1968,7 +1968,7 @@ def test_transform_loop_unroll_nested_counters(tmp_path, frontend): # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 11)) if b <= a]) + assert s == sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 11)) if b <= a) clean_test(filepath) clean_test(unrolled_filepath) @@ -2006,7 +2006,7 @@ def test_transform_loop_unroll_nested_neighbours(tmp_path, frontend): # Test the reference solution s = np.zeros(1) function(s=s) - assert s == 2 * sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == 2 * sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) # Apply transformation assert len(FindNodes(Loop).visit(routine.body)) == 3 loop_unroll(routine) @@ -2018,7 +2018,7 @@ def test_transform_loop_unroll_nested_neighbours(tmp_path, frontend): # Test transformation s = np.zeros(1) unrolled_function(s=s) - assert s == 2 * sum([a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))]) + assert s == 2 * sum(a + b + 1 for (a, b) in itertools.product(range(1, 11), range(1, 6))) clean_test(filepath) clean_test(unrolled_filepath) From 29e5186b1f7bc1170f26f5cfada0eeb869dccdca Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Tue, 8 Oct 2024 09:13:29 +0200 Subject: [PATCH 12/15] fix imports: 'FindVariables' has been moved to loki/ir --- loki/transformations/single_column/tests/test_scc_vertical.py | 3 +-- loki/transformations/single_column/vertical.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/loki/transformations/single_column/tests/test_scc_vertical.py b/loki/transformations/single_column/tests/test_scc_vertical.py index d3ed091ad..74cc97988 100644 --- a/loki/transformations/single_column/tests/test_scc_vertical.py +++ b/loki/transformations/single_column/tests/test_scc_vertical.py @@ -9,8 +9,7 @@ from loki import Subroutine, Dimension from loki.frontend import available_frontends -from loki.ir import FindNodes, Loop -from loki.expression import FindVariables +from loki.ir import FindNodes, Loop, FindVariables from loki.transformations.single_column import SCCFuseVerticalLoops diff --git a/loki/transformations/single_column/vertical.py b/loki/transformations/single_column/vertical.py index 0027ef69f..32e9caaef 100644 --- a/loki/transformations/single_column/vertical.py +++ b/loki/transformations/single_column/vertical.py @@ -7,12 +7,12 @@ from loki.batch import Transformation from loki.expression import ( - symbols as sym, FindVariables, + symbols as sym ) from loki.ir import ( nodes as ir, FindNodes, Transformer, is_loki_pragma, pragmas_attached, - get_pragma_parameters + get_pragma_parameters, FindVariables ) from loki.tools import as_tuple, CaseInsensitiveDict from loki.transformations.transform_loop import loop_fusion, loop_interchange From b12a33d047dbd7b17c3b48bb12c1eb60b5810c7e Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Tue, 8 Oct 2024 09:41:59 +0200 Subject: [PATCH 13/15] SCCFuseVerticalLoops: introduce new arg 'apply_to' to possibly restrict routines this transformation is applied to --- .../single_column/tests/test_scc_vertical.py | 7 +++++++ loki/transformations/single_column/vertical.py | 8 +++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/loki/transformations/single_column/tests/test_scc_vertical.py b/loki/transformations/single_column/tests/test_scc_vertical.py index 74cc97988..3a476a882 100644 --- a/loki/transformations/single_column/tests/test_scc_vertical.py +++ b/loki/transformations/single_column/tests/test_scc_vertical.py @@ -83,6 +83,13 @@ def test_simple_scc_fuse_verticals_transformation(frontend, horizontal, vertical kernel_loops = FindNodes(Loop).visit(kernel.body) assert len(kernel_loops) == 5 + # no-op as 'compute_column' is not within apply_to + SCCFuseVerticalLoops(vertical=vertical, apply_to=('another_kernel',)).apply(kernel, role='kernel') + # Ensure we have three loops in the kernel prior to transformation + kernel_loops = FindNodes(Loop).visit(kernel.body) + assert len(kernel_loops) == 5 + + # actual loop fusion and demotion ... (as apply_to is not provided and therefore all routines are dispatched) SCCFuseVerticalLoops(vertical=vertical).apply(kernel, role='kernel') # Ensure the two vertical loops are fused diff --git a/loki/transformations/single_column/vertical.py b/loki/transformations/single_column/vertical.py index 32e9caaef..d19f11818 100644 --- a/loki/transformations/single_column/vertical.py +++ b/loki/transformations/single_column/vertical.py @@ -42,10 +42,14 @@ class SCCFuseVerticalLoops(Transformation): vertical : :any:`Dimension` :any:`Dimension` object describing the variable conventions used in code to define the vertical data dimension and iteration space. + apply_to : list of str, optional + list of routines to apply this transformation to, if not provided or None + apply to all routines (default: None) """ - def __init__(self, vertical=None): + def __init__(self, vertical=None, apply_to=None): self.vertical = vertical + self.apply_to = apply_to or () def transform_subroutine(self, routine, **kwargs): """ @@ -63,6 +67,8 @@ def transform_subroutine(self, routine, **kwargs): return role = kwargs['role'] if role == 'kernel': + if self.apply_to and routine.name.lower() not in self.apply_to: + return self.process_kernel(routine) def process_kernel(self, routine): From 38a0a9b252f0546ac07d45528053e65f59bc4421 Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Tue, 8 Oct 2024 10:09:21 +0200 Subject: [PATCH 14/15] fix imports (again): 'FindVariables' has been moved to loki/ir --- loki/transformations/tests/test_transform_loop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loki/transformations/tests/test_transform_loop.py b/loki/transformations/tests/test_transform_loop.py index 73bd671f8..b7e04c607 100644 --- a/loki/transformations/tests/test_transform_loop.py +++ b/loki/transformations/tests/test_transform_loop.py @@ -12,11 +12,11 @@ from loki import Subroutine from loki.build import jit_compile, clean_test -from loki.expression import symbols as sym, FindVariables +from loki.expression import symbols as sym from loki.frontend import available_frontends from loki.ir import ( is_loki_pragma, pragmas_attached, FindNodes, Loop, Conditional, - Assignment + Assignment, FindVariables ) from loki.transformations.transform_loop import ( From 7a65ebe41ed928bbaf1834770e62f74e8aeb9d21 Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Tue, 8 Oct 2024 10:11:32 +0200 Subject: [PATCH 15/15] loop fusion: mention issue regarding provision of type information for functions defined by the Fortran standard (#390) --- loki/transformations/transform_loop.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/loki/transformations/transform_loop.py b/loki/transformations/transform_loop.py index 462c110f7..29299b4eb 100644 --- a/loki/transformations/transform_loop.py +++ b/loki/transformations/transform_loop.py @@ -371,12 +371,14 @@ def loop_fusion(routine): if len(lower_bounds) == 1: lower_bounds = lower_bounds[0] else: + # TODO: could/should be ProcedureSymbol, however refer to issue: #390 fct_symbol = sym.DeferredTypeSymbol(name='min', scope=routine) lower_bounds = sym.InlineCall(fct_symbol, parameters=as_tuple(lower_bounds)) if len(upper_bounds) == 1: upper_bounds = upper_bounds[0] else: + # TODO: could/should be ProcedureSymbol, however refer to issue: #390 fct_symbol = sym.DeferredTypeSymbol(name='max', scope=routine) upper_bounds = sym.InlineCall(fct_symbol, parameters=as_tuple(upper_bounds))