Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HoistVariablesAnalysis: remove unused explicit interfaces after inlining #319

Merged
merged 4 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions loki/transformations/hoist_variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def transform_subroutine(self, routine, **kwargs):
for child in successors:
if not isinstance(child, ProcedureItem):
continue

arg_map = dict(call_map[child.local_name].arg_iter())
hoist_variables = []
for var in child.trafo_data[self._key]["hoist_variables"]:
Expand Down
12 changes: 12 additions & 0 deletions loki/transformations/inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,18 @@ def inline_marked_subroutines(routine, allowed_aliases=None, adjust_imports=True
# Remove import if no further symbols used, otherwise clone with new symbols
import_map[impt] = impt.clone(symbols=new_symbols) if new_symbols else None

# Remove explicit interfaces of inlined routines
for intf in routine.interfaces:
if not intf.spec:
_body = tuple(
s.type.dtype.procedure for s in intf.symbols
if s.name not in callees or s.name in not_inlined
)
if _body:
import_map[intf] = intf.clone(body=_body)
else:
import_map[intf] = None

# Now move any callee imports we might need over to the caller
new_imports = set()
imported_module_map = CaseInsensitiveDict((im.module, im) for im in routine.imports)
Expand Down
11 changes: 6 additions & 5 deletions loki/transformations/single_column/hoist.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,13 @@ def driver_variable_declaration(self, routine, variables):

# Add explicit device-side allocations/deallocations for hoisted temporaries
vnames = ', '.join(v.name for v in variables)
pragma = ir.Pragma(keyword='acc', content=f'enter data create({vnames})')
pragma_post = ir.Pragma(keyword='acc', content=f'exit data delete({vnames})')
if vnames:
pragma = ir.Pragma(keyword='acc', content=f'enter data create({vnames})')
pragma_post = ir.Pragma(keyword='acc', content=f'exit data delete({vnames})')

# Add comments around standalone pragmas to avoid false attachment
routine.body.prepend((ir.Comment(''), pragma, ir.Comment('')))
routine.body.append((ir.Comment(''), pragma_post, ir.Comment('')))
# Add comments around standalone pragmas to avoid false attachment
routine.body.prepend((ir.Comment(''), pragma, ir.Comment('')))
routine.body.append((ir.Comment(''), pragma_post, ir.Comment('')))

def driver_call_argument_remapping(self, routine, call, variables):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1127,6 +1127,147 @@ def test_single_column_coalesced_hoist_nested_openacc(frontend, horizontal, vert
assert outer_kernel_pragmas[2].keyword == 'acc'
assert outer_kernel_pragmas[2].content == 'end data'


@pytest.mark.parametrize('frontend', available_frontends())
def test_single_column_coalesced_hoist_nested_inline_openacc(frontend, horizontal, vertical, blocking):
    """
    Test the correct addition of OpenACC pragmas to SCC format code
    when hoisting array temporaries to the driver from a kernel whose
    nested callee has been inlined.

    The inner kernel ``update_q`` declares the array temporary ``t`` and
    is called from ``compute_column`` under a ``!$loki inline`` marker.
    After inlining, the test expects the temporary to be passed from the
    driver as ``compute_column_t`` and the outer kernel to contain no
    remaining call statements.
    """

    fcode_driver = """
SUBROUTINE column_driver(nlon, nz, q, nb)
INTEGER, INTENT(IN) :: nlon, nz, nb ! Size of the horizontal and vertical
REAL, INTENT(INOUT) :: q(nlon,nz,nb)
INTEGER :: b, start, end

start = 1
end = nlon
do b=1, nb
call compute_column(start, end, nlon, nz, q(:,:,b))
end do
END SUBROUTINE column_driver
"""

    fcode_outer_kernel = """
SUBROUTINE compute_column(start, end, nlon, nz, q)
INTEGER, INTENT(IN) :: start, end ! Iteration indices
INTEGER, INTENT(IN) :: nlon, nz ! Size of the horizontal and vertical
REAL, INTENT(INOUT) :: q(nlon,nz)
INTEGER :: jl, jk
REAL :: c

c = 5.345
DO JL = START, END
Q(JL, NZ) = Q(JL, NZ) + 1.0
END DO

!$loki inline
call update_q(start, end, nlon, nz, q, c)

DO JL = START, END
Q(JL, NZ) = Q(JL, NZ) * C
END DO
END SUBROUTINE compute_column
"""

    fcode_inner_kernel = """
SUBROUTINE update_q(start, end, nlon, nz, q, c)
INTEGER, INTENT(IN) :: start, end ! Iteration indices
INTEGER, INTENT(IN) :: nlon, nz ! Size of the horizontal and vertical
REAL, INTENT(INOUT) :: q(nlon,nz)
REAL, INTENT(IN) :: c
REAL :: t(nlon,nz)
INTEGER :: jl, jk

DO jk = 2, nz
DO jl = start, end
t(jl, jk) = c * jk
q(jl, jk) = q(jl, jk-1) + t(jl, jk) * c
END DO
END DO
END SUBROUTINE update_q
"""

    # Mimic the scheduler internal mechanism to apply the transformation cascade
    outer_kernel_source = Sourcefile.from_source(fcode_outer_kernel, frontend=frontend)
    inner_kernel_source = Sourcefile.from_source(fcode_inner_kernel, frontend=frontend)
    driver_source = Sourcefile.from_source(fcode_driver, frontend=frontend)
    driver = driver_source['column_driver']
    outer_kernel = outer_kernel_source['compute_column']
    inner_kernel = inner_kernel_source['update_q']
    outer_kernel.enrich(inner_kernel)  # Attach inner kernel source to the call in the outer kernel
    driver.enrich(outer_kernel)  # Attach outer kernel source to the driver call

    driver_item = ProcedureItem(name='#column_driver', source=driver)
    outer_kernel_item = ProcedureItem(name='#compute_column', source=outer_kernel)
    inner_kernel_item = ProcedureItem(name='#update_q', source=inner_kernel)

    scc_hoist = SCCHoistPipeline(
        horizontal=horizontal, block_dim=blocking,
        dim_vars=(vertical.size,), directive='openacc'
    )

    # Inline the marked call *before* running the SCC pipeline, so that the
    # outer kernel owns the temporary of the inlined routine
    InlineTransformation(allowed_aliases=horizontal.index).apply(outer_kernel)

    # Apply in reverse order to ensure hoisting analysis gets run on kernel first
    scc_hoist.apply(inner_kernel, role='kernel', item=inner_kernel_item)
    # The call to ``update_q`` has already been inlined, hence the outer
    # kernel is processed without any successors
    scc_hoist.apply(
        outer_kernel, role='kernel', item=outer_kernel_item,
        targets=['update_q'], successors=()
    )
    scc_hoist.apply(
        driver, role='driver', item=driver_item,
        targets=['compute_column'], successors=(outer_kernel_item,)
    )

    # Ensure calls have correct arguments
    # driver
    calls = FindNodes(CallStatement).visit(driver.body)
    assert len(calls) == 1
    assert calls[0].arguments == ('start', 'end', 'nlon', 'nz', 'q(:, :, b)',
                                  'compute_column_t(:, :, b)')

    # Ensure a single outer parallel loop in driver
    with pragmas_attached(driver, Loop):
        driver_loops = FindNodes(Loop).visit(driver.body)
        assert len(driver_loops) == 1
        assert driver_loops[0].variable == 'b'
        assert driver_loops[0].bounds == '1:nb'
        assert driver_loops[0].pragma[0].keyword == 'acc'
        assert driver_loops[0].pragma[0].content == 'parallel loop gang vector_length(nlon)'

        # Ensure we have a kernel call in the driver loop
        kernel_calls = FindNodes(CallStatement).visit(driver_loops[0])
        assert len(kernel_calls) == 1
        assert kernel_calls[0].name == 'compute_column'

    # Ensure that the outer kernel contains a single vector loop that wraps
    # the vertical loop of the inlined routine
    with pragmas_attached(outer_kernel, Loop):
        outer_kernel_loops = FindNodes(Loop).visit(outer_kernel.body)
        assert len(outer_kernel_loops) == 2
        assert outer_kernel_loops[0].variable == 'jl'
        assert outer_kernel_loops[0].bounds == 'start:end'
        assert outer_kernel_loops[0].pragma[0].keyword == 'acc'
        assert outer_kernel_loops[0].pragma[0].content == 'loop vector'

        # check correctly nested vertical loop from inlined routine
        assert outer_kernel_loops[1] in FindNodes(Loop).visit(outer_kernel_loops[0].body)

    # Ensure the call was inlined
    assert not FindNodes(CallStatement).visit(outer_kernel.body)

    # Ensure the routine has been marked properly
    outer_kernel_pragmas = FindNodes(Pragma).visit(outer_kernel.ir)
    assert len(outer_kernel_pragmas) == 3
    assert outer_kernel_pragmas[0].keyword == 'acc'
    assert outer_kernel_pragmas[0].content == 'routine vector'
    assert outer_kernel_pragmas[1].keyword == 'acc'
    assert outer_kernel_pragmas[1].content == 'data present(q, t)'
    assert outer_kernel_pragmas[2].keyword == 'acc'
    assert outer_kernel_pragmas[2].content == 'end data'


@pytest.mark.parametrize('frontend', available_frontends())
def test_single_column_coalesced_nested(frontend, horizontal, blocking):
"""
Expand Down
103 changes: 103 additions & 0 deletions loki/transformations/tests/test_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,109 @@ def test_inline_marked_subroutines(frontend, adjust_imports):
assert imports[0].symbols == ('add_one', 'add_a_to_b')


@pytest.mark.parametrize('frontend', available_frontends())
def test_inline_marked_subroutines_with_interfaces(frontend):
    """
    Inline pragma-marked calls whose callees are declared through
    explicit interface blocks in the caller.

    The caller declares ``add_a_to_b`` and ``add_one`` in one interface
    block and ``add_two`` in a second one. One call to ``add_one`` is
    not marked for inlining; the test expects that call, and a single
    interface block listing only ``add_one``, to remain afterwards.
    """

    fcode_driver = """
subroutine test_pragma_inline(a, b)
implicit none

interface
subroutine add_a_to_b(a, b, n)
real(kind=8), intent(inout) :: a(:), b(:)
integer, intent(in) :: n
end subroutine add_a_to_b
subroutine add_one(a)
real(kind=8), intent(inout) :: a
end subroutine add_one
end interface

interface
subroutine add_two(a)
real(kind=8), intent(inout) :: a
end subroutine add_two
end interface

real(kind=8), intent(inout) :: a(3), b(3)
integer, parameter :: n = 3
integer :: i

do i=1, n
!$loki inline
call add_one(a(i))
end do

!$loki inline
call add_a_to_b(a(:), b(:), 3)

do i=1, n
call add_one(b(i))
!$loki inline
call add_two(b(i))
end do

end subroutine test_pragma_inline
"""

    fcode_module = """
module util_mod
implicit none

contains
subroutine add_one(a)
real(kind=8), intent(inout) :: a
a = a + 1
end subroutine add_one

subroutine add_two(a)
real(kind=8), intent(inout) :: a
a = a + 2
end subroutine add_two

subroutine add_a_to_b(a, b, n)
real(kind=8), intent(inout) :: a(:), b(:)
integer, intent(in) :: n
integer :: i

do i = 1, n
a(i) = a(i) + b(i)
end do
end subroutine add_a_to_b
end module util_mod
"""

    util_module = Module.from_source(fcode_module, frontend=frontend)
    caller = Subroutine.from_source(fcode_driver, frontend=frontend)
    caller.enrich(util_module.subroutines)

    # Before inlining: every call is resolved to its module routine
    original_calls = FindNodes(ir.CallStatement).visit(caller.body)
    expected_callees = ('add_one', 'add_a_to_b', 'add_one', 'add_two')
    for position, callee_name in enumerate(expected_callees):
        assert original_calls[position].routine == util_module[callee_name]

    inline_marked_subroutines(routine=caller, allowed_aliases=('I',))

    # After inlining: three loops and one assignment per inlined call
    loops = FindNodes(ir.Loop).visit(caller.body)
    assert len(loops) == 3

    assignments = FindNodes(ir.Assignment).visit(caller.body)
    assert len(assignments) == 3
    expected_assignments = (
        ('a(i)', 'a(i) + 1'),
        ('a(i)', 'a(i) + b(i)'),
        ('b(i)', 'b(i) + 2'),
    )
    for statement, (lhs, rhs) in zip(assignments, expected_assignments):
        assert statement.lhs == lhs
        assert statement.rhs == rhs

    # The one call without an inline marker must survive unchanged
    remaining_calls = FindNodes(ir.CallStatement).visit(caller.body)
    assert len(remaining_calls) == 1
    surviving_call = remaining_calls[0]
    assert surviving_call.routine.name == 'add_one'
    assert surviving_call.arguments == ('b(i)',)

    # Only the interface of the routine that is still called is kept
    remaining_interfaces = FindNodes(ir.Interface).visit(caller.spec)
    assert len(remaining_interfaces) == 1
    assert remaining_interfaces[0].symbols == ('add_one',)


@pytest.mark.parametrize('frontend', available_frontends())
@pytest.mark.parametrize('adjust_imports', [True, False])
def test_inline_marked_routine_with_optionals(frontend, adjust_imports):
Expand Down
Loading