PTX: LLVM assertion error during function merging #461

maleadt · 2023-05-26T15:47:55Z

After #444, running the CUDA.jl test suite against an LLVM build with assertions enabled triggers the following assertion failure:

opt: /workspace/srcdir/llvm-project/llvm/lib/IR/Operator.cpp:91: bool llvm::GEPOperator::accumulateConstantOffset(const llvm::DataLayout&, llvm::APInt&, llvm::function_ref<bool(llvm::Value&, llvm::APInt&)>) const: Assertion `Offset.getBitWidth() == DL.getIndexSizeInBits(getPointerAddressSpace()) && "The offset bit width does not match DL specification."' failed.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.	Program arguments: /home/tim/Julia/src/julia/build/dev/usr/tools/opt wip.ll --mergefunc -o /dev/null
 #0 0x00007f09df31871f PrintStackTraceSignalHandler(void*) Signals.cpp:0:0
 #1 0x00007f09df31623c SignalHandler(int) Signals.cpp:0:0
 #2 0x00007f09de24fab0 (/usr/lib/libc.so.6+0x39ab0)
 #3 0x00007f09de29f26c (/usr/lib/libc.so.6+0x8926c)
 #4 0x00007f09de24fa08 raise (/usr/lib/libc.so.6+0x39a08)
 #5 0x00007f09de238538 abort (/usr/lib/libc.so.6+0x22538)
 #6 0x00007f09de23845c (/usr/lib/libc.so.6+0x2245c)
 #7 0x00007f09de2483d6 (/usr/lib/libc.so.6+0x323d6)
 #8 0x00007f09df4c1571 llvm::GEPOperator::accumulateConstantOffset(llvm::DataLayout const&, llvm::APInt&, llvm::function_ref<bool (llvm::Value&, llvm::APInt&)>) const (/home/tim/Julia/src/julia/build/dev/usr/tools/../lib/libLLVM-15jl.so+0xcc1571)
 #9 0x00007f09e0116a8c llvm::FunctionComparator::cmpGEPs(llvm::GEPOperator const*, llvm::GEPOperator const*) const (/home/tim/Julia/src/julia/build/dev/usr/tools/../lib/libLLVM-15jl.so+0x1916a8c)
#10 0x00007f09e011712c llvm::FunctionComparator::cmpOperations(llvm::Instruction const*, llvm::Instruction const*, bool&) const (.part.397) FunctionComparator.cpp:0:0
#11 0x00007f09e0117db0 llvm::FunctionComparator::cmpBasicBlocks(llvm::BasicBlock const*, llvm::BasicBlock const*) const (/home/tim/Julia/src/julia/build/dev/usr/tools/../lib/libLLVM-15jl.so+0x1917db0)
#12 0x00007f09e01193ce llvm::FunctionComparator::compare() (/home/tim/Julia/src/julia/build/dev/usr/tools/../lib/libLLVM-15jl.so+0x19193ce)
#13 0x00007f09e0859e56 (anonymous namespace)::MergeFunctions::runOnModule(llvm::Module&) MergeFunctions.cpp:0:0
#14 0x00007f09e085c12c llvm::MergeFunctionsPass::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/tim/Julia/src/julia/build/dev/usr/tools/../lib/libLLVM-15jl.so+0x205c12c)
#15 0x00007f09e2084f5d llvm::detail::PassModel<llvm::Module, llvm::MergeFunctionsPass, llvm::PreservedAnalyses, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/tim/Julia/src/julia/build/dev/usr/tools/../lib/libLLVM-15jl.so+0x3884f5d)
#16 0x00007f09df4cdeb2 llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/tim/Julia/src/julia/build/dev/usr/tools/../lib/libLLVM-15jl.so+0xccdeb2)
#17 0x0000000000426e70 llvm::runPassPipeline(llvm::StringRef, llvm::Module&, llvm::TargetMachine*, llvm::TargetLibraryInfoImpl*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::StringRef, llvm::ArrayRef<llvm::StringRef>, llvm::ArrayRef<llvm::PassPlugin>, llvm::opt_tool::OutputKind, llvm::opt_tool::VerifierKind, bool, bool, bool, bool, bool, bool) (/home/tim/Julia/src/julia/build/dev/usr/tools/opt+0x426e70)
#18 0x000000000041a703 main (/home/tim/Julia/src/julia/build/dev/usr/tools/opt+0x41a703)
#19 0x00007f09de239850 (/usr/lib/libc.so.6+0x23850)
#20 0x00007f09de23990a __libc_start_main (/usr/lib/libc.so.6+0x2390a)
#21 0x000000000041a992 _start /workspace/srcdir/glibc-2.12.2/csu/../sysdeps/x86_64/elf/start.S:116:0
zsh: IOT instruction  /home/tim/Julia/src/julia/build/dev/usr/tools/opt wip.ll --mergefunc -o

MWE:

using CUDA

function kernel_a(x::Bool)
    @cuprint("a ")
    @cuda dynamic=true kernel_b(x)
    return
end

function kernel_b(x::Bool)
    @cuprint("b ")
    @cuda dynamic=true kernel_c(x)
    return
end

function kernel_c(x::Bool)
    @cuprint("c ")
    return
end

@cuda kernel_a(true)

This is the post-optimization IR, which triggers the assertion when run through the function-merging pass (`opt --mergefunc`):

source_filename = "start"
target datalayout = "e-p:64:64:64:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

%0 = type { i64 }
%1 = type { i64, i64, i64 }

@global = private unnamed_addr constant [3 x i8] c"a \00", align 1
@global1 = private unnamed_addr constant [3 x i8] c"b \00", align 1
@global2 = private unnamed_addr constant [75 x i8] c"ERROR: a CUDA error was thrown during kernel execution: %s (code %ld, %s)\0A\00", align 1
@global3 = private unnamed_addr constant [3 x i8] c"c \00", align 1
@global4 = private unnamed_addr constant [108 x i8] c"ERROR: a %s was thrown during kernel execution.\0A       Run Julia on debug level 2 for device stack traces.\0A\00", align 1
@global5 = private unnamed_addr constant [110 x i8] c"WARNING: could not signal exception status to the host, execution will continue.\0A         Please file a bug.\0A\00", align 1
@global6 = private unnamed_addr constant [10 x i8] c"exception\00", align 1

declare i64 @snork(i32) local_unnamed_addr

declare i64 @wobble(i32) local_unnamed_addr

declare i32 @vprintf(i8*, i8*) local_unnamed_addr

declare i64 @snork7(i64, { i32, i32, i32 }, { i32, i32, i32 }, i32) local_unnamed_addr

declare i32 @widget(i64, i64) local_unnamed_addr

; Function Attrs: argmemonly nocallback nofree nosync nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #0

; Function Attrs: argmemonly nocallback nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #0

define internal void @bar(i64 zeroext %arg) local_unnamed_addr #1 {
bb:
  %tmp = alloca %0, align 8
  %tmp1 = bitcast %0* %tmp to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %tmp1)
  %tmp2 = getelementptr inbounds %0, %0* %tmp, i32 0, i32 0
  store i64 %arg, i64* %tmp2, align 8
  %tmp3 = call i32 @vprintf(i8* getelementptr inbounds ([108 x i8], [108 x i8]* @global4, i32 0, i32 0), i8* nonnull %tmp1)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %tmp1)
  ret void
}

; Function Attrs: nocallback nounwind
declare void @llvm.nvvm.membar.sys() #2

; Function Attrs: noinline noreturn
define internal fastcc void @wombat([1 x i64] %arg, [1 x i32]* nocapture noundef nonnull readonly align 4 dereferenceable(4) %arg1) unnamed_addr #3 {
bb:
  %tmp = alloca %1, align 8
  %tmp2 = getelementptr inbounds [1 x i32], [1 x i32]* %arg1, i32 0, i32 0
  %tmp3 = load i32, i32* %tmp2, align 4, !tbaa !5, !alias.scope !9, !noalias !12
  %tmp4 = call i64 @snork(i32 %tmp3)
  %tmp5 = zext i32 %tmp3 to i64
  %tmp6 = call i64 @wobble(i32 %tmp3)
  %tmp7 = bitcast %1* %tmp to i8*
  call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %tmp7)
  %tmp8 = getelementptr inbounds %1, %1* %tmp, i32 0, i32 0
  store i64 %tmp4, i64* %tmp8, align 8
  %tmp9 = getelementptr inbounds %1, %1* %tmp, i32 0, i32 1
  store i64 %tmp5, i64* %tmp9, align 8
  %tmp10 = getelementptr inbounds %1, %1* %tmp, i32 0, i32 2
  store i64 %tmp6, i64* %tmp10, align 8
  %tmp11 = call i32 @vprintf(i8* getelementptr inbounds ([75 x i8], [75 x i8]* @global2, i32 0, i32 0), i8* nonnull %tmp7)
  call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %tmp7)
  call void @bar(i64 ptrtoint ([10 x i8]* @global6 to i64))
  call void @blam([1 x i64] %arg)
  call void asm sideeffect "exit;", ""() #4
  unreachable
}

define ptx_kernel void @wombat8([1 x i64] %arg, i8 zeroext %arg1) local_unnamed_addr #1 {
bb:
  %tmp = alloca [1 x i32], align 4
  %tmp2 = call i32 @vprintf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @global1, i32 0, i32 0), i8* null)
  %tmp3 = extractvalue [1 x i64] %arg, 0
  %tmp4 = call i64 @snork7(i64 ptrtoint (void ([1 x i64], i8)* @blam9 to i64), { i32, i32, i32 } { i32 1, i32 1, i32 1 }, { i32, i32, i32 } { i32 1, i32 1, i32 1 }, i32 0)
  %tmp5 = inttoptr i64 %tmp4 to [1 x i64]*
  %tmp6 = getelementptr inbounds [1 x i64], [1 x i64]* %tmp5, i32 0, i32 0
  store i64 %tmp3, i64* %tmp6, align 8, !tbaa !17, !alias.scope !19, !noalias !20
  %tmp7 = inttoptr i64 %tmp4 to i8*
  %tmp8 = getelementptr i8, i8* %tmp7, i32 8
  store i8 %arg1, i8* %tmp8, align 1, !tbaa !17, !alias.scope !19, !noalias !20
  %tmp9 = call i32 @widget(i64 %tmp4, i64 0)
  %tmp10 = icmp eq i32 %tmp9, 0
  br i1 %tmp10, label %bb13, label %bb11

bb11:                                             ; preds = %bb
  %tmp12 = getelementptr inbounds [1 x i32], [1 x i32]* %tmp, i32 0, i32 0
  store i32 %tmp9, i32* %tmp12, align 4, !tbaa !21, !alias.scope !23, !noalias !24
  call fastcc void @eggs([1 x i64] %arg, [1 x i32]* %tmp)
  unreachable

bb13:                                             ; preds = %bb
  ret void
}

define internal void @blam([1 x i64] %arg) local_unnamed_addr #1 {
bb:
  %tmp = extractvalue [1 x i64] %arg, 0
  %tmp1 = icmp eq i64 %tmp, 0
  br i1 %tmp1, label %bb4, label %bb2

bb2:                                              ; preds = %bb
  %tmp3 = inttoptr i64 %tmp to i64*
  store i64 1, i64* %tmp3, align 1, !tbaa !17, !alias.scope !19, !noalias !20
  call void @llvm.nvvm.membar.sys()
  br label %bb6

bb4:                                              ; preds = %bb
  %tmp5 = call i32 @vprintf(i8* getelementptr inbounds ([110 x i8], [110 x i8]* @global5, i32 0, i32 0), i8* null)
  br label %bb6

bb6:                                              ; preds = %bb4, %bb2
  ret void
}

define ptx_kernel void @foo([1 x i64] %arg, i8 zeroext %arg1) local_unnamed_addr #1 {
bb:
  %tmp = alloca [1 x i32], align 4
  %tmp2 = call i32 @vprintf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @global, i32 0, i32 0), i8* null)
  %tmp3 = extractvalue [1 x i64] %arg, 0
  %tmp4 = call i64 @snork7(i64 ptrtoint (void ([1 x i64], i8)* @wombat8 to i64), { i32, i32, i32 } { i32 1, i32 1, i32 1 }, { i32, i32, i32 } { i32 1, i32 1, i32 1 }, i32 0)
  %tmp5 = inttoptr i64 %tmp4 to [1 x i64]*
  %tmp6 = getelementptr inbounds [1 x i64], [1 x i64]* %tmp5, i32 0, i32 0
  store i64 %tmp3, i64* %tmp6, align 8, !tbaa !17, !alias.scope !19, !noalias !20
  %tmp7 = inttoptr i64 %tmp4 to i8*
  %tmp8 = getelementptr i8, i8* %tmp7, i32 8
  store i8 %arg1, i8* %tmp8, align 1, !tbaa !17, !alias.scope !19, !noalias !20
  %tmp9 = call i32 @widget(i64 %tmp4, i64 0)
  %tmp10 = icmp eq i32 %tmp9, 0
  br i1 %tmp10, label %bb13, label %bb11

bb11:                                             ; preds = %bb
  %tmp12 = getelementptr inbounds [1 x i32], [1 x i32]* %tmp, i32 0, i32 0
  store i32 %tmp9, i32* %tmp12, align 4, !tbaa !21, !alias.scope !23, !noalias !24
  call fastcc void @wombat([1 x i64] %arg, [1 x i32]* %tmp)
  unreachable

bb13:                                             ; preds = %bb
  ret void
}

; Function Attrs: noinline noreturn
define internal fastcc void @eggs([1 x i64] %arg, [1 x i32]* nocapture noundef nonnull readonly align 4 dereferenceable(4) %arg1) unnamed_addr #3 {
bb:
  %tmp = alloca %1, align 8
  %tmp2 = getelementptr inbounds [1 x i32], [1 x i32]* %arg1, i32 0, i32 0
  %tmp3 = load i32, i32* %tmp2, align 4, !tbaa !5, !alias.scope !9, !noalias !12
  %tmp4 = call i64 @snork(i32 %tmp3)
  %tmp5 = zext i32 %tmp3 to i64
  %tmp6 = call i64 @wobble(i32 %tmp3)
  %tmp7 = bitcast %1* %tmp to i8*
  call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %tmp7)
  %tmp8 = getelementptr inbounds %1, %1* %tmp, i32 0, i32 0
  store i64 %tmp4, i64* %tmp8, align 8
  %tmp9 = getelementptr inbounds %1, %1* %tmp, i32 0, i32 1
  store i64 %tmp5, i64* %tmp9, align 8
  %tmp10 = getelementptr inbounds %1, %1* %tmp, i32 0, i32 2
  store i64 %tmp6, i64* %tmp10, align 8
  %tmp11 = call i32 @vprintf(i8* getelementptr inbounds ([75 x i8], [75 x i8]* @global2, i32 0, i32 0), i8* nonnull %tmp7)
  call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %tmp7)
  call void @bar(i64 ptrtoint ([10 x i8]* @global6 to i64))
  call void @blam([1 x i64] %arg)
  call void asm sideeffect "exit;", ""() #4
  unreachable
}

define ptx_kernel void @blam9([1 x i64] %arg, i8 zeroext %arg1) local_unnamed_addr #1 {
bb:
  %tmp = call i32 @vprintf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @global3, i32 0, i32 0), i8* null)
  ret void
}

attributes #0 = { argmemonly nocallback nofree nosync nounwind willreturn }
attributes #1 = { "probe-stack"="inline-asm" }
attributes #2 = { nocallback nounwind }
attributes #3 = { noinline noreturn "probe-stack"="inline-asm" }
attributes #4 = { nounwind }

!llvm.module.flags = !{!0, !1}
!julia.kernel = !{!2, !3, !4}

!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{void ([1 x i64], i8)* @wombat8}
!3 = !{void ([1 x i64], i8)* @blam9}
!4 = !{void ([1 x i64], i8)* @foo}
!5 = !{!6, !6, i64 0}
!6 = !{!"jtbaa_const", !7, i64 0}
!7 = !{!"jtbaa", !8, i64 0}
!8 = !{!"jtbaa"}
!9 = !{!10}
!10 = !{!"jnoalias_const", !11}
!11 = !{!"jnoalias"}
!12 = !{!13, !14, !15, !16}
!13 = !{!"jnoalias_gcframe", !11}
!14 = !{!"jnoalias_stack", !11}
!15 = !{!"jnoalias_data", !11}
!16 = !{!"jnoalias_typemd", !11}
!17 = !{!18, !18, i64 0}
!18 = !{!"jtbaa_data", !7, i64 0}
!19 = !{!15}
!20 = !{!13, !14, !16, !10}
!21 = !{!22, !22, i64 0}
!22 = !{!"jtbaa_stack", !7, i64 0}
!23 = !{!14}
!24 = !{!13, !15, !16, !10}

Or, reduced:

target datalayout = "e-p:64:64:64:32"
target triple = "nvptx64-nvidia-cuda"

%0 = type { i64 }

define internal fastcc void @foo(%0* %tmp) {
bb:
  %tmp1 = getelementptr inbounds %0, %0* %tmp, i32 0, i32 0
  unreachable
}

define internal fastcc void @bar(%0* %tmp) {
bb:
  %tmp1 = getelementptr inbounds %0, %0* %tmp, i32 0, i32 0
  unreachable
}

I guess the data layout (DL) modification, i.e. using a 32-bit pointer index size (the trailing `:32` in `p:64:64:64:32`), was illegal? I still need to take a closer look.

The text was updated successfully, but these errors were encountered:

maleadt · 2023-05-31T11:18:32Z

This doesn't happen on LLVM's tip of tree (ToT); I bisected the fix to https://reviews.llvm.org/D143437.

maleadt · 2023-05-31T11:21:51Z

Looking at the failing assertion in `GEPOperator::accumulateConstantOffset`:

bool GEPOperator::accumulateConstantOffset(
    const DataLayout &DL, APInt &Offset,
    function_ref<bool(Value &, APInt &)> ExternalAnalysis) const {
  assert(Offset.getBitWidth() ==
             DL.getIndexSizeInBits(getPointerAddressSpace()) &&
         "The offset bit width does not match DL specification.");

... I guess we can't just set the pointer index size to 32 bits without breaking 64-bit offsets (which may be emitted by Julia, and which we still want to support). Maybe the alternative is an optimization pass that tries to demote GEP indices to 32 bits, if possible. In any case, let's revert the data layout (DL) change.

maleadt added the `ptx` label ("Stuff about the NVIDIA PTX back-end.") on May 26, 2023

This was referenced May 31, 2023

PTX: Don't use a 32-bit pointer index type. #462

Merged

PTX: Demote GEP indices to 32 bits, if possible. #463

Open

maleadt closed this as completed in #462 May 31, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

PTX: LLVM assertion error during function merging #461

PTX: LLVM assertion error during function merging #461

maleadt commented May 26, 2023 •

edited

Loading

maleadt commented May 31, 2023

maleadt commented May 31, 2023

PTX: LLVM assertion error during function merging #461

PTX: LLVM assertion error during function merging #461

Comments

maleadt commented May 26, 2023 • edited Loading

maleadt commented May 31, 2023

maleadt commented May 31, 2023

maleadt commented May 26, 2023 •

edited

Loading