Skip to content

Commit

Permalink
Cuda: fix for bug issue kokkos#125
Browse files Browse the repository at this point in the history
Fixes an issue with cuda_get_max_block_size and cuda_get_opt_block_size.
This makes the choice of constant vs. local memory a template parameter,
defaulted based on the size of the existing DriverType template parameter.
It also changes the interface by adding a new shmem_extra argument, which
is required for lambdas, since in those cases the functor doesn't provide
a shmem-size function.

Both functions are part of the impl namespace and thus not public yet.
  • Loading branch information
crtrott authored and hcedwar committed Nov 12, 2015
1 parent 28683f7 commit 0415dc6
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 14 deletions.
22 changes: 10 additions & 12 deletions core/src/Cuda/Kokkos_Cuda_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,16 @@
namespace Kokkos { namespace Impl {


template<class DriverType>
int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t shmem_extra) {
#if ( CUDA_VERSION < 6050 )
return 256;
#else
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );

int numBlocks;
if(Large) {
int blockSize=32;
int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
Expand All @@ -73,7 +72,7 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {

while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );

cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
Expand All @@ -85,7 +84,7 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
else return blockSize/2;
} else {
int blockSize=32;
int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
Expand All @@ -94,7 +93,7 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {

while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );

cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
Expand All @@ -108,12 +107,11 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
#endif
}

template<class DriverType>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t shmem_extra) {
#if ( CUDA_VERSION < 6050 )
return 256;
#else
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );

int blockSize=16;
int numBlocks;
Expand All @@ -126,7 +124,7 @@ int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
blockSize*=2;

//calculate the occupancy with that optBlockSize and check whether its larger than the largest one found so far
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
Expand All @@ -140,7 +138,7 @@ int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
} else {
while(blockSize<1024) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );

cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
Expand Down
4 changes: 2 additions & 2 deletions core/src/Cuda/Kokkos_Cuda_Parallel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -593,7 +593,7 @@ class ParallelFor< FunctorType
: m_functor( arg_functor )
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( arg_functor ) / arg_policy.vector_length() )
Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( arg_functor , arg_policy.scratch_size() ) / arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
, m_shmem_begin( sizeof(double) * ( m_team_size + 2 ) )
, m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( m_functor , m_team_size ) )
Expand Down Expand Up @@ -946,7 +946,7 @@ class ParallelReduce< FunctorType
, m_shmem_size( 0 )
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor ) / arg_policy.vector_length() )
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.scratch_size() ) / arg_policy.vector_length() )
{
// Return Init value if the number of worksets is zero
if( arg_policy.league_size() == 0) {
Expand Down

0 comments on commit 0415dc6

Please sign in to comment.