Skip to content

Commit

Permalink
Revert adoption of CUDA indexing in general - this can only work with…
Browse files Browse the repository at this point in the history
… later versions of the compiler, just like module based dispatch, and thus must be guarded against usage in earlier (e.g. 1.6) versions.
  • Loading branch information
AlexVlx committed Nov 29, 2017
1 parent b881cf7 commit d2fd1f5
Show file tree
Hide file tree
Showing 63 changed files with 175 additions and 173 deletions.
44 changes: 23 additions & 21 deletions include/hip/hcc_detail/hip_runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -381,27 +381,29 @@ __device__ void __threadfence_system(void) ;
* @}
*/

template<typename std::common_type<
decltype(hc_get_group_id),
decltype(hc_get_group_size),
decltype(hc_get_num_groups),
decltype(hc_get_workitem_id)>::type f>
class Coordinates {
using R = decltype(f(0));

struct X { __device__ operator R() const { return f(0); } };
struct Y { __device__ operator R() const { return f(1); } };
struct Z { __device__ operator R() const { return f(2); } };
public:
static constexpr X x{};
static constexpr Y y{};
static constexpr Z z{};
};

static constexpr Coordinates<hc_get_group_size> blockDim;
static constexpr Coordinates<hc_get_group_id> blockIdx;
static constexpr Coordinates<hc_get_num_groups> gridDim;
static constexpr Coordinates<hc_get_workitem_id> threadIdx;
#if __hcc_workweek__ >= 17481
template<typename std::common_type<
decltype(hc_get_group_id),
decltype(hc_get_group_size),
decltype(hc_get_num_groups),
decltype(hc_get_workitem_id)>::type f>
class Coordinates {
using R = decltype(f(0));

struct X { __device__ operator R() const { return f(0); } };
struct Y { __device__ operator R() const { return f(1); } };
struct Z { __device__ operator R() const { return f(2); } };
public:
static constexpr X x{};
static constexpr Y y{};
static constexpr Z z{};
};

static constexpr Coordinates<hc_get_group_size> blockDim;
static constexpr Coordinates<hc_get_group_id> blockIdx;
static constexpr Coordinates<hc_get_num_groups> gridDim;
static constexpr Coordinates<hc_get_workitem_id> threadIdx;
#endif

#define hipThreadIdx_x (hc_get_workitem_id(0))
#define hipThreadIdx_y (hc_get_workitem_id(1))
Expand Down
4 changes: 2 additions & 2 deletions samples/0_Intro/square/square.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ template <typename T>
__global__ void
vector_square(T *C_d, const T *A_d, size_t N)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;

for (size_t i=offset; i<N; i+=stride) {
C_d[i] = A_d[i] * A_d[i];
Expand Down
4 changes: 2 additions & 2 deletions src/device_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ __device__ void *__hip_hc_malloc(size_t size)
{
return (void*)nullptr;
}
uint32_t totalThreads = blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;
uint32_t totalThreads = hipBlockDim_x * hipGridDim_x * hipBlockDim_y * hipGridDim_y * hipBlockDim_z * hipGridDim_z;
uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x;

uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads;
uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads;
Expand Down
4 changes: 2 additions & 2 deletions src/hip_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1307,9 +1307,9 @@ namespace
__global__
void hip_fill_n(RandomAccessIterator f, N n, T value)
{
const uint32_t grid_dim = gridDim.x * blockDim.x;
const uint32_t grid_dim = hipGridDim_x * hipBlockDim_x;

size_t idx = blockIdx.x * block_dim + threadIdx.x;
size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x;
while (idx < n) {
__builtin_memcpy(
reinterpret_cast<void*>(&f[idx]),
Expand Down
2 changes: 1 addition & 1 deletion tests/src/Functional/device/hipFuncDeviceSynchronize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ THE SOFTWARE.
#define NUM_STREAMS 2

__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
// Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below
if(tx == 0){
for(int i = 0; i<num;i++){
Expand Down
2 changes: 1 addition & 1 deletion tests/src/deviceLib/hipComplex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ THE SOFTWARE.
#define SIZE 64<<2

__global__ void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
}

Expand Down
4 changes: 2 additions & 2 deletions tests/src/deviceLib/hipDeviceMemcpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@

__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
{
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
memcpy(Out + tx, In + tx, sizeof(uint32_t));
}

__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
{
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
memset(ptr + tx, val, sizeof(uint32_t));
}

Expand Down
2 changes: 1 addition & 1 deletion tests/src/deviceLib/hipFloatMath.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ THE SOFTWARE.


__global__ void floatMath(hipLaunchParm lp, float *In, float *Out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Out[tid] = __cosf(In[tid]);
Out[tid] = __exp10f(Out[tid]);
Out[tid] = __expf(Out[tid]);
Expand Down
2 changes: 1 addition & 1 deletion tests/src/deviceLib/hipSimpleAtomicsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ int computeGold(int *gpuData, const int len)
__global__ void testKernel(hipLaunchParm lp,int *g_odata)
{
// access thread id
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int tid = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;

// Test various atomic instructions

Expand Down
32 changes: 16 additions & 16 deletions tests/src/deviceLib/hipTestDevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,82 +32,82 @@ THE SOFTWARE.
#define SIZE N*sizeof(float)

__global__ void test_sincosf(hipLaunchParm lp, float* a, float* b, float *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
sincosf(a[tid], b+tid, c+tid);
}

__global__ void test_sincospif(hipLaunchParm lp, float* a, float* b, float *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
sincospif(a[tid], b+tid, c+tid);
}

__global__ void test_fdividef(hipLaunchParm lp, float *a, float* b, float *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
c[tid] = fdividef(a[tid], b[tid]);
}

__global__ void test_llrintf(hipLaunchParm lp, float *a, long long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = llrintf(a[tid]);
}

__global__ void test_lrintf(hipLaunchParm lp, float *a, long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = lrintf(a[tid]);
}

__global__ void test_rintf(hipLaunchParm lp, float *a, float *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = rintf(a[tid]);
}

__global__ void test_llroundf(hipLaunchParm lp, float *a, long long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = llroundf(a[tid]);
}

__global__ void test_lroundf(hipLaunchParm lp, float *a, long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = lroundf(a[tid]);
}

__global__ void test_rhypotf(hipLaunchParm lp, float *a, float* b, float *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
c[tid] = rhypotf(a[tid], b[tid]);
}

__global__ void test_norm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
d[tid] = norm3df(a[tid], b[tid], c[tid]);
}

__global__ void test_norm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
e[tid] = norm4df(a[tid], b[tid], c[tid], d[tid]);
}

__global__ void test_normf(hipLaunchParm lp, float *a, float *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = normf(N, a);
}

__global__ void test_rnorm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
d[tid] = rnorm3df(a[tid], b[tid], c[tid]);
}

__global__ void test_rnorm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
e[tid] = rnorm4df(a[tid], b[tid], c[tid], d[tid]);
}

__global__ void test_rnormf(hipLaunchParm lp, float *a, float *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = rnormf(N, a);
}

__global__ void test_erfinvf(hipLaunchParm lp, float *a, float *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = erff(erfinvf(a[tid]));
}

Expand Down
28 changes: 14 additions & 14 deletions tests/src/deviceLib/hipTestDeviceDouble.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,72 +32,72 @@ THE SOFTWARE.
#define SIZE N*sizeof(double)

__global__ void test_sincos(hipLaunchParm lp, double* a, double* b, double *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
sincos(a[tid], b+tid, c+tid);
}

__global__ void test_sincospi(hipLaunchParm lp, double* a, double* b, double *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
sincospi(a[tid], b+tid, c+tid);
}

__global__ void test_llrint(hipLaunchParm lp, double *a, long long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = llrint(a[tid]);
}

__global__ void test_lrint(hipLaunchParm lp, double *a, long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = lrint(a[tid]);
}

__global__ void test_rint(hipLaunchParm lp, double *a, double *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = rint(a[tid]);
}

__global__ void test_llround(hipLaunchParm lp, double *a, long long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = llround(a[tid]);
}

__global__ void test_lround(hipLaunchParm lp, double *a, long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = lround(a[tid]);
}

__global__ void test_rhypot(hipLaunchParm lp, double *a, double* b, double *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
c[tid] = rhypot(a[tid], b[tid]);
}

__global__ void test_norm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
d[tid] = norm3d(a[tid], b[tid], c[tid]);
}

__global__ void test_norm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
e[tid] = norm4d(a[tid], b[tid], c[tid], d[tid]);
}

__global__ void test_rnorm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
d[tid] = rnorm3d(a[tid], b[tid], c[tid]);
}

__global__ void test_rnorm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
e[tid] = rnorm4d(a[tid], b[tid], c[tid], d[tid]);
}

__global__ void test_rnorm(hipLaunchParm lp, double *a, double *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = rnorm(N, a);
}

__global__ void test_erfinv(hipLaunchParm lp, double *a, double *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = erf(erfinv(a[tid]));
}

Expand Down
2 changes: 1 addition & 1 deletion tests/src/deviceLib/hipTestDeviceSymbol.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ __device__ int globalOut[NUM];

__global__ void Assign(hipLaunchParm lp, int* Out)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Out[tid] = globalIn[tid];
globalOut[tid] = globalIn[tid];
}
Expand Down
4 changes: 2 additions & 2 deletions tests/src/deviceLib/hipTestHalf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ THE SOFTWARE.
#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__

__global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
__half a = A[tx];
__half b = B[tx];
__half c = C[tx];
Expand All @@ -45,7 +45,7 @@ __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
}

__global__ void __half2Math(hipLaunchParm lp, __half2 *A, __half2 *B, __half2 *C) {
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
__half2 a = A[tx];
__half2 b = B[tx];
__half2 c = C[tx];
Expand Down
2 changes: 1 addition & 1 deletion tests/src/deviceLib/hipThreadFence.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ THE SOFTWARE.

__global__ void vAdd(hipLaunchParm lp, float *In1, float *In2, float *In3, float *In4, float *Out)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
In4[tid] = In1[tid] + In2[tid];
__threadfence();
In3[tid] = In3[tid] + In4[tid];
Expand Down
6 changes: 3 additions & 3 deletions tests/src/deviceLib/hip_anyall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ __global__ void
warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift)
{

int tid = threadIdx.x + blockIdx.x * blockDim.x;
device_any[threadIdx.x>>pshift] = __any(tid -77);
device_all[threadIdx.x>>pshift] = __all(tid -77);
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
device_any[hipThreadIdx_x>>pshift] = __any(tid -77);
device_all[hipThreadIdx_x>>pshift] = __all(tid -77);
}

int main(int argc, char *argv[])
Expand Down
8 changes: 4 additions & 4 deletions tests/src/deviceLib/hip_ballot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ __global__ void
gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift)
{

int tid = threadIdx.x + blockIdx.x * blockDim.x;
const unsigned int warp_num = threadIdx.x >> pshift;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
const unsigned int warp_num = hipThreadIdx_x >> pshift;
#ifdef __HIP_PLATFORM_HCC__
atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
#else
atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
#endif

}
Expand Down
4 changes: 2 additions & 2 deletions tests/src/deviceLib/hip_brev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp,
unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height)
{

int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

int i = y * width + x;
if ( i < (width * height)) {
Expand Down
Loading

0 comments on commit d2fd1f5

Please sign in to comment.