Numba-dpex implementation of Rambo fails validation #978
Reproducer:

```python
import dpnp
import numba as nb
from numba_dpex import dpjit
import numpy

# Make dpnp.pi available for the dpjit version (borrowed from NumPy).
dpnp.pi = numpy.pi


def rambo_ref(nevts, nout, C1, F1, Q1, output):
    # Pure NumPy reference implementation.
    C = 2.0 * C1 - 1.0
    S = numpy.sqrt(1 - numpy.square(C))
    F = 2.0 * numpy.pi * F1
    Q = -numpy.log(Q1)

    output[:, :, 0] = Q
    output[:, :, 1] = Q * S * numpy.sin(F)
    output[:, :, 2] = Q * S * numpy.cos(F)
    output[:, :, 3] = Q * C


@dpjit
def rambo_dpjit(nevts, nout, C1, F1, Q1, output):
    # numba-dpex dpjit implementation using dpnp calls.
    C = 2.0 * C1 - 1.0
    S = dpnp.sqrt(1 - dpnp.square(C))
    F = 2.0 * dpnp.pi * F1
    Q = -dpnp.log(Q1)

    output[:, :, 0] = Q
    output[:, :, 1] = Q * S * dpnp.sin(F)
    output[:, :, 2] = Q * S * dpnp.cos(F)
    output[:, :, 3] = Q * C


@nb.njit(parallel=True)
def rambo_njit(nevts, nout, C1, F1, Q1, output):
    # Parallel Numba njit implementation on host NumPy arrays.
    C = 2.0 * C1 - 1.0
    S = numpy.sqrt(1 - numpy.square(C))
    F = 2.0 * numpy.pi * F1
    Q = -numpy.log(Q1)

    output[:, :, 0] = Q
    output[:, :, 1] = Q * S * numpy.sin(F)
    output[:, :, 2] = Q * S * numpy.cos(F)
    output[:, :, 3] = Q * C


def initialize(nevts, nout):
    C1 = numpy.empty((nevts, nout))
    F1 = numpy.empty((nevts, nout))
    Q1 = numpy.empty((nevts, nout))
    numpy.random.seed(777)
    for i in range(nevts):
        for j in range(nout):
            C1[i, j] = numpy.random.rand()
            F1[i, j] = numpy.random.rand()
            Q1[i, j] = numpy.random.rand() * numpy.random.rand()
    return (
        C1,
        F1,
        Q1,
        numpy.empty((nevts, nout, 4)),
    )


def test_rambo(nevts=32768, nout=4):
    C1, F1, Q1, output = initialize(nevts, nout)
    output_n = numpy.copy(output)

    # Copy C1, F1, Q1 to dpnp
    C1d = dpnp.asarray(C1)
    F1d = dpnp.asarray(F1)
    Q1d = dpnp.asarray(Q1)
    output_d = dpnp.asarray(output)

    rambo_ref(nevts, nout, C1, F1, Q1, output)
    rambo_njit(nevts, nout, C1, F1, Q1, output_n)
    rambo_dpjit(nevts, nout, C1d, F1d, Q1d, output_d)

    return output, output_n, output_d


output, output_n, output_d = test_rambo()

print(numpy.allclose(output[:, :, 0], output_d[:, :, 0]))
print(numpy.allclose(output[:, :, 1], output_d[:, :, 1]))
print(numpy.allclose(output[:, :, 2], output_d[:, :, 2]))
print(numpy.allclose(output[:, :, 3], output_d[:, :, 3]))
```
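For completeness, a small follow-up sketch (it assumes the reproducer above has already been executed in the same session) that also compares the host `njit` result against the reference, which helps confirm the mismatch is specific to the dpjit/dpnp path:

```python
import dpnp
import numpy

# Compare both variants against the NumPy reference produced above.
print("njit matches reference: ", numpy.allclose(output, output_n))
print("dpjit matches reference:", numpy.allclose(output, dpnp.asnumpy(output_d)))
```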
The following is an abridged version of the Numba IR generated for the reproducer above. The issue stems from incorrect index computation for the output array: the index is computed using only part of the indexing information for the sliced output view (see the sketch below).
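To make the expected indexing explicit, here is a plain Python/NumPy sketch (illustration only, not generated code) of the scalarized form that the parfor for an assignment like `output[:, :, 0] = Q` has to be equivalent to; the fixed last-dimension index, or equivalently the output view's offset, must enter every element index:

```python
import numpy


def assign_channel_ref(output, Q, k):
    # Scalarized equivalent of ``output[:, :, k] = Q`` for a 3-D output and
    # a 2-D Q: the fixed index k is part of every store's address.
    for i in range(output.shape[0]):
        for j in range(output.shape[1]):
            output[i, j, k] = Q[i, j]


out = numpy.zeros((2, 3, 4))
q = numpy.ones((2, 3))
assign_channel_ref(out, q, 0)
assert numpy.array_equal(out[:, :, 0], q)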
On further analysis, for a parfor the slicing operation of the output array appears to be where things go wrong. In dpjit's case, either the slice is done incorrectly or the sliced arrays are passed into the parfor incorrectly.
A minimal reproducer:

```python
import dpnp
import numba as nb
from numba_dpex import dpjit
import numpy


@dpjit
def foo(a):
    return a[1:5]


a = dpnp.arange(10)
b = foo(a)
print("Dpnp input ", a)
print("Dpnp slice [1:5] ", b)


@nb.njit
def bar(a):
    return a[1:5]


na = numpy.arange(10)
nb = bar(na)
print("NumPy input ", na)
print("NumPy slice [1:5] ", nb)
```

Output:
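Independent of the exact values printed, here is a NumPy-only illustration (not numba-dpex code) of what a correct basic slice has to preserve: the result is a view into the parent buffer whose data pointer is advanced by `start * itemsize`, which is what the `getelementptr` on the data pointer in the LLVM IR below is computing.

```python
import numpy

na = numpy.arange(10)
view = na[1:5]

# The slice is a view over the same buffer ...
assert view.base is na
# ... whose data pointer is advanced by one element ...
assert view.ctypes.data - na.ctypes.data == 1 * na.itemsize
# ... and whose shape and strides describe the four selected elements.
assert view.shape == (4,) and view.strides == na.strides
```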
LLVM IR generated for the `foo` function:

```llvm
; ModuleID = 'foo.ll'
source_filename = "foo.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@_ZN08NumbaEnv8__main__3fooB2v1B1cB2_8B1tB1JB1TB1CB3_2fB1WB1QB1AB1lB1bB1WB2_1B1yB1BB1CB2_0B1oB1RB2_6B1GB1EB1LB1EB1UB1MB1EB1LB1YB1SB1PB1GB1rB1IB1QB1MB1VB1jB1AB1QB1nB1iB1QB1cB1IB1XB1KB1QB1IB1MB1VB1wB1oB1OB1GB1KB1oB1QB1DB1DB1VB1QB1QB1RB2_1B1NB1HB1AB1SB2_2B1FB1QB2_9B1XB1gB1SB1sB2_8B1wB1mB2_4B1oB1gB1LB1EB2_0B1AE11DpnpNdArrayIxLi1E1C7mutable7alignedE = common local_unnamed_addr global i8* null
define i32 @_ZN8__main__3foo(
{ i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }* noalias nocapture %retptr,
{ i8*, i32, i8* }** noalias nocapture readnone %excinfo,
i8* %arg.a.0,
i8* %arg.a.1,
i64 %arg.a.2,
i64 %arg.a.3,
i64* %arg.a.4,
i64 %arg.a.5.0,
i64 %arg.a.6.0
) local_unnamed_addr {
entry:
tail call void @NRT_incref(i8* %arg.a.0)
%0 = icmp slt i64 %arg.a.5.0, 1
%.65.sroa.0.2 = select i1 %0, i64 %arg.a.5.0, i64 1, !prof !0
%1 = icmp slt i64 %arg.a.5.0, 5
%.65.sroa.13.0 = select i1 %1, i64 %arg.a.5.0, i64 5, !prof !0
%.153 = sub i64 %.65.sroa.13.0, %.65.sroa.0.2
%.160.inv = icmp sgt i64 %.153, 0
%.162 = select i1 %.160.inv, i64 %.153, i64 0
%.178 = getelementptr i64, i64* %arg.a.4, i64 %.65.sroa.0.2
tail call void @NRT_incref(i8* %arg.a.0)
tail call void @NRT_decref(i8* null)
tail call void @NRT_decref(i8* %arg.a.0)
tail call void @NRT_incref(i8* %arg.a.0)
tail call void @NRT_decref(i8* null)
tail call void @NRT_decref(i8* %arg.a.0)
%retptr.repack = getelementptr inbounds { i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }* %retptr, i64 0, i32 0
store i8* %arg.a.0, i8** %retptr.repack, align 8
%retptr.repack73 = getelementptr inbounds { i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }* %retptr, i64 0, i32 1
store i8* %arg.a.1, i8** %retptr.repack73, align 8
%retptr.repack75 = getelementptr inbounds { i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }* %retptr, i64 0, i32 2
store i64 %.162, i64* %retptr.repack75, align 8
%retptr.repack77 = getelementptr inbounds { i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }* %retptr, i64 0, i32 3
store i64 %arg.a.3, i64* %retptr.repack77, align 8
%retptr.repack79 = getelementptr inbounds { i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }* %retptr, i64 0, i32 4
store i64* %.178, i64** %retptr.repack79, align 8
%2 = getelementptr inbounds { i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }* %retptr, i64 0, i32 5, i64 0
store i64 %.162, i64* %2, align 8
%3 = getelementptr inbounds { i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, i64*, [1 x i64], [1 x i64] }* %retptr, i64 0, i32 6, i64 0
store i64 %arg.a.6.0, i64* %3, align 8
ret i32 0
}
declare void @NRT_incref(i8* noalias nocapture) local_unnamed_addr
declare void @NRT_decref(i8* noalias nocapture) local_unnamed_addr
!0 = !{!"branch_weights", i32 1, i32 99}
```
The relevant lines of code are:
All these look fine at first glance, with the only caveat that

The above issue was caused by a bug in the unboxing of a dpnp.ndarray and has been fixed by:
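A quick sanity check for the unboxing fix, re-using the earlier minimal slicing reproducer (a sketch; it just compares the dpjit slice against plain NumPy):

```python
import dpnp
import numpy
from numba_dpex import dpjit


@dpjit
def foo(a):
    return a[1:5]


# With correct unboxing, the slice computed under dpjit should match NumPy.
result = dpnp.asnumpy(foo(dpnp.arange(10)))
assert numpy.array_equal(result, numpy.arange(10)[1:5])
```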
One more reproducer:

```python
import dpnp
from numba_dpex import dpjit


def foo2(a, b):
    b[:, :, 0] = a


a1 = dpnp.arange(64)
a1 = a1.reshape(4, 16)
b1 = dpnp.empty((4, 16, 4))

dpjit(foo2)(a1, b1)
print(b1)
```

The above example still produces incorrect results even after c367015. The reason seems to be incorrect indexing in the kernel generated for the expression `b[:, :, 0] = a` (a NumPy reference for comparison is sketched below).
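For reference, the same assignment in plain NumPy; `b1[:, :, 0]` is expected to reproduce `b_ref[:, :, 0]` (the names `a_ref`/`b_ref` are only used for this sketch):

```python
import numpy

a_ref = numpy.arange(64).reshape(4, 16)
b_ref = numpy.empty((4, 16, 4))
b_ref[:, :, 0] = a_ref

# Channel 0 of the output should hold a copy of a_ref; the remaining
# channels stay uninitialized because only the k == 0 slice was assigned.
assert numpy.array_equal(b_ref[:, :, 0], a_ref)
```

A validation against the dpjit result could then compare `dpnp.asnumpy(b1)[:, :, 0]` with `b_ref[:, :, 0]`.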
@adarshyoga @mingjie-intel I have yet to fully confirm it, but here is the most likely scenario causing the bug:
Rambo workload's numba-dpex implementation with dpnp calls fails validation. In my initial analysis, I see that the results produced differ from what the numpy/python implementation produces. The numba-dpex prange implementation passes validation, so this problem seems to be specific to dpnp calls.
How to reproduce:
1. Follow the instructions to set up dpbench.
2. Run Rambo: `python -c 'import dpbench; dpbench.run_benchmark("rambo")'` (an equivalent in-Python call is sketched below).
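For convenience, the same benchmark run from a Python session (assumes dpbench is installed and set up per its instructions):

```python
import dpbench

# Runs the Rambo benchmark, including the validation step reported as
# failing in this issue for the numba-dpex dpnp implementation.
dpbench.run_benchmark("rambo")
```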