Skip to content

Commit

Permalink
Cuda: fix for bug issue kokkos#125
Browse files Browse the repository at this point in the history
Fixes an issue with cuda_get_max_block_size and cuda_get_opt_block_size.
This makes the choice of constant vs. local memory a template parameter,
defaulted based on the size of the existing DriverType template parameter.
It also changes the interface by adding a new shmem_extra argument, which
is required for lambdas, since in those cases the functor doesn't provide
a shmem-size function.

Both functions are part of the impl namespace and thus not public yet.
  • Loading branch information
crtrott authored and hcedwar committed Nov 12, 2015
1 parent 28683f7 commit 0415dc6
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 14 deletions.
22 changes: 10 additions & 12 deletions core/src/Cuda/Kokkos_Cuda_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,16 @@
namespace Kokkos { namespace Impl {


template<class DriverType>
int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t shmem_extra) {
#if ( CUDA_VERSION < 6050 )
return 256;
#else
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );

int numBlocks;
if(Large) {
int blockSize=32;
int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
Expand All @@ -73,7 +72,7 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {

while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );

cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
Expand All @@ -85,7 +84,7 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
else return blockSize/2;
} else {
int blockSize=32;
int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
Expand All @@ -94,7 +93,7 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {

while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );

cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
Expand All @@ -108,12 +107,11 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
#endif
}

template<class DriverType>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t shmem_extra) {
#if ( CUDA_VERSION < 6050 )
return 256;
#else
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );

int blockSize=16;
int numBlocks;
Expand All @@ -126,7 +124,7 @@ int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
blockSize*=2;

//calculate the occupancy with that optBlockSize and check whether its larger than the largest one found so far
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
Expand All @@ -140,7 +138,7 @@ int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
} else {
while(blockSize<1024) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );

cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
Expand Down
4 changes: 2 additions & 2 deletions core/src/Cuda/Kokkos_Cuda_Parallel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -593,7 +593,7 @@ class ParallelFor< FunctorType
: m_functor( arg_functor )
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( arg_functor ) / arg_policy.vector_length() )
Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( arg_functor , arg_policy.scratch_size() ) / arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
, m_shmem_begin( sizeof(double) * ( m_team_size + 2 ) )
, m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( m_functor , m_team_size ) )
Expand Down Expand Up @@ -946,7 +946,7 @@ class ParallelReduce< FunctorType
, m_shmem_size( 0 )
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor ) / arg_policy.vector_length() )
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.scratch_size() ) / arg_policy.vector_length() )
{
// Return Init value if the number of worksets is zero
if( arg_policy.league_size() == 0) {
Expand Down

0 comments on commit 0415dc6

Please sign in to comment.