Revert adoption of CUDA indexing in general - this can only work with…

… later versions of the compiler, just like module based dispatch, and thus must be guarded against usage in earlier (e.g. 1.6) versions.
ROCm · Nov 29, 2017 · d2fd1f5 · d2fd1f5
1 parent b881cf7
commit d2fd1f5
Show file tree

Hide file tree

Showing 63 changed files with 175 additions and 173 deletions.
diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h
@@ -381,27 +381,29 @@ __device__ void  __threadfence_system(void) ;
  * @}
  */
 
-template<typename std::common_type<
-    decltype(hc_get_group_id),
-    decltype(hc_get_group_size),
-    decltype(hc_get_num_groups),
-    decltype(hc_get_workitem_id)>::type f>
-class Coordinates {
-    using R = decltype(f(0));
-
-    struct X { __device__ operator R() const { return f(0); } };
-    struct Y { __device__ operator R() const { return f(1); } };
-    struct Z { __device__ operator R() const { return f(2); } };
-public:
-    static constexpr X x{};
-    static constexpr Y y{};
-    static constexpr Z z{};
-};
-
-static constexpr Coordinates<hc_get_group_size> blockDim;
-static constexpr Coordinates<hc_get_group_id> blockIdx;
-static constexpr Coordinates<hc_get_num_groups> gridDim;
-static constexpr Coordinates<hc_get_workitem_id> threadIdx;
+#if __hcc_workweek__ >= 17481
+    template<typename std::common_type<
+        decltype(hc_get_group_id),
+        decltype(hc_get_group_size),
+        decltype(hc_get_num_groups),
+        decltype(hc_get_workitem_id)>::type f>
+    class Coordinates {
+        using R = decltype(f(0));
+
+        struct X { __device__ operator R() const { return f(0); } };
+        struct Y { __device__ operator R() const { return f(1); } };
+        struct Z { __device__ operator R() const { return f(2); } };
+    public:
+        static constexpr X x{};
+        static constexpr Y y{};
+        static constexpr Z z{};
+    };
+
+    static constexpr Coordinates<hc_get_group_size> blockDim;
+    static constexpr Coordinates<hc_get_group_id> blockIdx;
+    static constexpr Coordinates<hc_get_num_groups> gridDim;
+    static constexpr Coordinates<hc_get_workitem_id> threadIdx;
+#endif
 
 #define hipThreadIdx_x (hc_get_workitem_id(0))
 #define hipThreadIdx_y (hc_get_workitem_id(1))

diff --git a/samples/0_Intro/square/square.cu b/samples/0_Intro/square/square.cu
@@ -40,8 +40,8 @@ template <typename T>
 __global__ void
 vector_square(T *C_d, const T *A_d, size_t N)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
 
     for (size_t i=offset; i<N; i+=stride) {
         C_d[i] = A_d[i] * A_d[i];

diff --git a/src/device_util.cpp b/src/device_util.cpp
@@ -45,8 +45,8 @@ __device__ void *__hip_hc_malloc(size_t size)
     {
         return (void*)nullptr;
     }
-    uint32_t totalThreads = blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
-    uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;
+    uint32_t totalThreads = hipBlockDim_x * hipGridDim_x * hipBlockDim_y * hipGridDim_y * hipBlockDim_z * hipGridDim_z;
+    uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x;
 
     uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads;
     uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads;

diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
@@ -1307,9 +1307,9 @@ namespace
     __global__
     void hip_fill_n(RandomAccessIterator f, N n, T value)
     {
-        const uint32_t grid_dim = gridDim.x * blockDim.x;
+        const uint32_t grid_dim = hipGridDim_x * hipBlockDim_x;
 
-        size_t idx = blockIdx.x * block_dim + threadIdx.x;
+        size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x;
         while (idx < n) {
             __builtin_memcpy(
                 reinterpret_cast<void*>(&f[idx]),

diff --git a/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp b/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp
@@ -34,7 +34,7 @@ THE SOFTWARE.
 #define NUM_STREAMS 2
 
 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     // Kernel loop designed to execute very slowly... ... ...   so we can test timing-related behavior below
     if(tx == 0){
         for(int i = 0; i<num;i++){

diff --git a/tests/src/deviceLib/hipComplex.cpp b/tests/src/deviceLib/hipComplex.cpp
@@ -27,7 +27,7 @@ THE SOFTWARE.
 #define SIZE 64<<2
 
 __global__  void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
 }
 

diff --git a/tests/src/deviceLib/hipDeviceMemcpy.cpp b/tests/src/deviceLib/hipDeviceMemcpy.cpp
@@ -16,13 +16,13 @@
 
 __global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
 {
-    int tx = threadIdx.x;
+    int tx = hipThreadIdx_x;
     memcpy(Out + tx, In + tx, sizeof(uint32_t));
 }
 
 __global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
 {
-    int tx = threadIdx.x;
+    int tx = hipThreadIdx_x;
     memset(ptr + tx, val, sizeof(uint32_t));
 }
 

diff --git a/tests/src/deviceLib/hipFloatMath.cpp b/tests/src/deviceLib/hipFloatMath.cpp
@@ -35,7 +35,7 @@ THE SOFTWARE.
 
 
 __global__ void floatMath(hipLaunchParm lp, float *In, float *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   Out[tid] = __cosf(In[tid]);
   Out[tid] = __exp10f(Out[tid]);
   Out[tid] = __expf(Out[tid]);

diff --git a/tests/src/deviceLib/hipSimpleAtomicsTest.cpp b/tests/src/deviceLib/hipSimpleAtomicsTest.cpp
@@ -217,7 +217,7 @@ int computeGold(int *gpuData, const int len)
 __global__ void testKernel(hipLaunchParm lp,int *g_odata)
 {
     // access thread id
-    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int tid = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
 
     // Test various atomic instructions
 

diff --git a/tests/src/deviceLib/hipTestDevice.cpp b/tests/src/deviceLib/hipTestDevice.cpp
@@ -32,82 +32,82 @@ THE SOFTWARE.
 #define SIZE N*sizeof(float)
 
 __global__ void test_sincosf(hipLaunchParm lp, float* a, float* b, float *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     sincosf(a[tid], b+tid, c+tid);
 }
 
 __global__ void test_sincospif(hipLaunchParm lp, float* a, float* b, float *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     sincospif(a[tid], b+tid, c+tid);
 }
 
 __global__ void test_fdividef(hipLaunchParm lp, float *a, float* b, float *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     c[tid] = fdividef(a[tid], b[tid]);
 }
 
 __global__ void test_llrintf(hipLaunchParm lp, float *a, long long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = llrintf(a[tid]);
 }
 
 __global__ void test_lrintf(hipLaunchParm lp, float *a, long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = lrintf(a[tid]);
 }
 
 __global__ void test_rintf(hipLaunchParm lp, float *a, float *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = rintf(a[tid]);
 }
 
 __global__ void test_llroundf(hipLaunchParm lp, float *a, long long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = llroundf(a[tid]);
 }
 
 __global__ void test_lroundf(hipLaunchParm lp, float *a, long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = lroundf(a[tid]);
 }
 
 __global__ void test_rhypotf(hipLaunchParm lp, float *a, float* b, float *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     c[tid] = rhypotf(a[tid], b[tid]);
 }
 
 __global__ void test_norm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     d[tid] = norm3df(a[tid], b[tid], c[tid]);
 }
 
 __global__ void test_norm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     e[tid] = norm4df(a[tid], b[tid], c[tid], d[tid]);
 }
 
 __global__ void test_normf(hipLaunchParm lp, float *a, float *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = normf(N, a);
 }
 
 __global__ void test_rnorm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     d[tid] = rnorm3df(a[tid], b[tid], c[tid]);
 }
 
 __global__ void test_rnorm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     e[tid] = rnorm4df(a[tid], b[tid], c[tid], d[tid]);
 }
 
 __global__ void test_rnormf(hipLaunchParm lp, float *a, float *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = rnormf(N, a);
 }
 
 __global__ void test_erfinvf(hipLaunchParm lp, float *a, float *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = erff(erfinvf(a[tid]));
 }
 

diff --git a/tests/src/deviceLib/hipTestDeviceDouble.cpp b/tests/src/deviceLib/hipTestDeviceDouble.cpp
@@ -32,72 +32,72 @@ THE SOFTWARE.
 #define SIZE N*sizeof(double)
 
 __global__ void test_sincos(hipLaunchParm lp, double* a, double* b, double *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     sincos(a[tid], b+tid, c+tid);
 }
 
 __global__ void test_sincospi(hipLaunchParm lp, double* a, double* b, double *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     sincospi(a[tid], b+tid, c+tid);
 }
 
 __global__ void test_llrint(hipLaunchParm lp, double *a, long long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = llrint(a[tid]);
 }
 
 __global__ void test_lrint(hipLaunchParm lp, double *a, long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = lrint(a[tid]);
 }
 
 __global__ void test_rint(hipLaunchParm lp, double *a, double *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = rint(a[tid]);
 }
 
 __global__ void test_llround(hipLaunchParm lp, double *a, long long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = llround(a[tid]);
 }
 
 __global__ void test_lround(hipLaunchParm lp, double *a, long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = lround(a[tid]);
 }
 
 __global__ void test_rhypot(hipLaunchParm lp, double *a, double* b, double *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     c[tid] = rhypot(a[tid], b[tid]);
 }
 
 __global__ void test_norm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     d[tid] = norm3d(a[tid], b[tid], c[tid]);
 }
 
 __global__ void test_norm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     e[tid] = norm4d(a[tid], b[tid], c[tid], d[tid]);
 }
 
 __global__ void test_rnorm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     d[tid] = rnorm3d(a[tid], b[tid], c[tid]);
 }
 
 __global__ void test_rnorm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     e[tid] = rnorm4d(a[tid], b[tid], c[tid], d[tid]);
 }
 
 __global__ void test_rnorm(hipLaunchParm lp, double *a, double *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = rnorm(N, a);
 }
 
 __global__ void test_erfinv(hipLaunchParm lp, double *a, double *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = erf(erfinv(a[tid]));
 }
 

diff --git a/tests/src/deviceLib/hipTestDeviceSymbol.cpp b/tests/src/deviceLib/hipTestDeviceSymbol.cpp
@@ -36,7 +36,7 @@ __device__ int globalOut[NUM];
 
 __global__ void Assign(hipLaunchParm lp, int* Out)
 {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     Out[tid] = globalIn[tid];
     globalOut[tid] = globalIn[tid];
 }

diff --git a/tests/src/deviceLib/hipTestHalf.cpp b/tests/src/deviceLib/hipTestHalf.cpp
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__
 
 __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
-  int tx = threadIdx.x;
+  int tx = hipThreadIdx_x;
   __half a = A[tx];
   __half b = B[tx];
   __half c = C[tx];
@@ -45,7 +45,7 @@ __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
 }
 
 __global__ void __half2Math(hipLaunchParm lp, __half2 *A, __half2 *B, __half2 *C) {
-  int tx = threadIdx.x;
+  int tx = hipThreadIdx_x;
   __half2 a = A[tx];
   __half2 b = B[tx];
   __half2 c = C[tx];

diff --git a/tests/src/deviceLib/hipThreadFence.cpp b/tests/src/deviceLib/hipThreadFence.cpp
@@ -33,7 +33,7 @@ THE SOFTWARE.
 
 __global__ void vAdd(hipLaunchParm lp, float *In1, float *In2, float *In3, float *In4, float *Out)
 {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     In4[tid] = In1[tid] + In2[tid];
     __threadfence();
     In3[tid] = In3[tid] + In4[tid];

diff --git a/tests/src/deviceLib/hip_anyall.cpp b/tests/src/deviceLib/hip_anyall.cpp
@@ -37,9 +37,9 @@ __global__ void
 	warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift)
 {
 
-   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-   device_any[threadIdx.x>>pshift] = __any(tid -77);
-   device_all[threadIdx.x>>pshift] = __all(tid -77);
+   int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+   device_any[hipThreadIdx_x>>pshift] = __any(tid -77);
+   device_all[hipThreadIdx_x>>pshift] = __all(tid -77);
 }
 
 int main(int argc, char *argv[])

diff --git a/tests/src/deviceLib/hip_ballot.cpp b/tests/src/deviceLib/hip_ballot.cpp
@@ -34,12 +34,12 @@ __global__ void
 	gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift)
 {
 
-   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-   const unsigned int warp_num = threadIdx.x >> pshift;
+   int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+   const unsigned int warp_num = hipThreadIdx_x >> pshift;
 #ifdef __HIP_PLATFORM_HCC__
-   atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
+   atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
 #else
-	atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
+	atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
 #endif
 
 }

diff --git a/tests/src/deviceLib/hip_brev.cpp b/tests/src/deviceLib/hip_brev.cpp
@@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp,
              unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height)
   {
 
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
       int i = y * width + x;
       if ( i < (width * height)) {