diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 1d61c9c5906..870284b3723 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -345,6 +345,9 @@ Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const { m_scratchFlags = static_cast( mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. KOKKOS_IMPL_CUDA_SAFE_CALL( (cuda_memset_wrapper(m_scratchFlags, 0, alloc_size))); } diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 3b5a1e0017c..d9fb99f1751 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -226,6 +226,9 @@ Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { m_scratchFlags = static_cast( mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchFlags, 0, alloc_size)); } diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 05b50d52534..f05deab54b0 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -288,12 +288,16 @@ sycl::device_ptr SYCLInternal::scratch_flags(const std::size_t size) { m_scratchFlagsCount, sizeScratchGrain); m_scratchFlags = static_cast(mem_space.allocate( "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size)); - } - auto memset_event = m_queue->memset(m_scratchFlags, 0, - m_scratchFlagsCount * sizeScratchGrain); + + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. + auto memset_event = m_queue->memset(m_scratchFlags, 0, + m_scratchFlagsCount * sizeScratchGrain); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); + m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); #endif + } return m_scratchFlags; } diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 953d2235b31..f55280e22e3 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -234,6 +234,7 @@ class Kokkos::Impl::ParallelReduce= static_cast(n_wgroups)) reducer.init(&local_mem[local_id * value_count]); else { @@ -279,6 +280,7 @@ class Kokkos::Impl::ParallelReduce= static_cast(n_wgroups)) reducer.init(&local_value); else { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 7f29dcf9d9c..5333e3c8a83 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -177,6 +177,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -219,6 +220,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_value); else { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 01819b848af..27165c59e3a 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -229,6 +229,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -281,6 +282,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_value); else { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index d6f3219defd..977b69bc9eb 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -187,6 +187,7 @@ class ParallelScanSYCLBase { } item.barrier(sycl::access::fence_space::global_space); if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; value_type total; reducer.init(&total);