[FEA]: Add __pipeline_arrive_on_noinc #2927

Status: Open · 1 task done
igor-nv opened this issue Nov 21, 2024 · 1 comment
Labels: feature request (New feature or request.)

igor-nv commented Nov 21, 2024

Is this a duplicate?

Area

libcu++

Is your feature request related to a problem? Please describe.

Please expose a noinc version of __pipeline_arrive_on. It would allow avoiding a double barrier arrive when synchronizing LDGSTS (cp.async) transfers via an mbarrier; a usage sketch follows the snippet below.

Describe the solution you'd like

__device__ __forceinline__
void __pipeline_arrive_on_noinc(uint64_t* barrier)
{
    // Trigger an arrive-on for prior cp.async (LDGSTS) operations without
    // incrementing the mbarrier's pending count.
    asm volatile (
        "cp.async.mbarrier.arrive.noinc.shared.b64 [%0];"
        :: "r"(static_cast<uint32_t>(__cvta_generic_to_shared(barrier)))
        : "memory"
    );
}
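
For context, here is a minimal, hypothetical usage sketch of the requested primitive (not part of the original request). It assumes the existing <cuda_pipeline_primitives.h> and <cuda_awbarrier_primitives.h> intrinsics, and the expected-count accounting (2 * blockDim.x: one explicit arrive plus one asynchronous arrive-on per thread) is an assumption about how the noinc form would be balanced:

// Hypothetical usage sketch: launch with 128 threads per block.
#include <cstdint>
#include <cuda_awbarrier_primitives.h>   // __mbarrier_* intrinsics
#include <cuda_pipeline_primitives.h>    // __pipeline_* intrinsics

__global__ void copy_tile(const int* __restrict__ gmem, int* __restrict__ out)
{
    __shared__ int smem[128];
    __shared__ __mbarrier_t bar;

    if (threadIdx.x == 0) {
        // Assumption: with noinc the asynchronous arrive-on is not pre-compensated,
        // so the expected count covers both the explicit and the async arrivals.
        __mbarrier_init(&bar, 2 * blockDim.x);
    }
    __syncthreads();

    // LDGSTS: asynchronous global -> shared copy.
    __pipeline_memcpy_async(&smem[threadIdx.x], &gmem[threadIdx.x], sizeof(int));
    __pipeline_commit();

    // Proposed primitive: arrive on the barrier when the copy completes,
    // without the extra pending-count increment done by __pipeline_arrive_on.
    __pipeline_arrive_on_noinc(reinterpret_cast<uint64_t*>(&bar));

    // Single explicit arrive per thread; the token identifies the phase to wait on.
    __mbarrier_token_t token = __mbarrier_arrive(&bar);
    while (!__mbarrier_test_wait(&bar, token)) { /* spin */ }

    out[threadIdx.x] = smem[threadIdx.x];
}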

Describe alternatives you've considered

No response

Additional context

No response

igor-nv added the "feature request" label on Nov 21, 2024
ahendriksen (Contributor) commented:

We would need to add the following .rst documentation:

cp.async.mbarrier.arrive.b64
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda

   // cp.async.mbarrier.arrive.b64 [addr]; // PTX ISA 70, SM_80
   template <typename=void>
   __device__ static inline void cp_async_mbarrier_arrive(
     uint64_t* addr);

cp.async.mbarrier.arrive.noinc.b64
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda

   // cp.async.mbarrier.arrive.noinc.b64 [addr]; // PTX ISA 70, SM_80
   template <typename=void>
   __device__ static inline void cp_async_mbarrier_arrive_noinc(
     uint64_t* addr);

The following header code:

/*
// cp.async.mbarrier.arrive.b64 [addr]; // PTX ISA 70, SM_80
template <typename=void>
__device__ static inline void cp_async_mbarrier_arrive(
  uint64_t* addr);
*/
#if __cccl_ptx_isa >= 700
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_mbarrier_arrive_is_not_supported_before_SM_80__();
template <typename=void>
_CCCL_DEVICE static inline void cp_async_mbarrier_arrive(
  _CUDA_VSTD::uint64_t* __addr)
{
  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,(
    asm (
      "cp.async.mbarrier.arrive.b64 [%0];"
      :
      : "r"(__as_ptr_smem(__addr))
      : "memory"
    );
  ),(
    // Unsupported architectures will have a linker error with a semi-decent error message
    __cuda_ptx_cp_async_mbarrier_arrive_is_not_supported_before_SM_80__();
  ));
}
#endif // __cccl_ptx_isa >= 700

/*
// cp.async.mbarrier.arrive.noinc.b64 [addr]; // PTX ISA 70, SM_80
template <typename=void>
__device__ static inline void cp_async_mbarrier_arrive_noinc(
  uint64_t* addr);
*/
#if __cccl_ptx_isa >= 700
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_mbarrier_arrive_noinc_is_not_supported_before_SM_80__();
template <typename=void>
_CCCL_DEVICE static inline void cp_async_mbarrier_arrive_noinc(
  _CUDA_VSTD::uint64_t* __addr)
{
  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,(
    asm (
      "cp.async.mbarrier.arrive.noinc.b64 [%0];"
      :
      : "r"(__as_ptr_smem(__addr))
      : "memory"
    );
  ),(
    // Unsupported architectures will have a linker error with a semi-decent error message
    __cuda_ptx_cp_async_mbarrier_arrive_noinc_is_not_supported_before_SM_80__();
  ));
}
#endif // __cccl_ptx_isa >= 700

And the following tests:

__global__ void test_cp_async_mbarrier_arrive(void ** fn_ptr) {
#if __cccl_ptx_isa >= 700
  NV_IF_TARGET(NV_PROVIDES_SM_80, (
    // cp.async.mbarrier.arrive.b64 [addr];
    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(uint64_t* )>(cuda::ptx::cp_async_mbarrier_arrive));
  ));
#endif // __cccl_ptx_isa >= 700

#if __cccl_ptx_isa >= 700
  NV_IF_TARGET(NV_PROVIDES_SM_80, (
    // cp.async.mbarrier.arrive.noinc.b64 [addr];
    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(uint64_t* )>(cuda::ptx::cp_async_mbarrier_arrive_noinc));
  ));
#endif // __cccl_ptx_isa >= 700
}
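
If such a cuda::ptx::cp_async_mbarrier_arrive_noinc exposure lands, the intrinsic requested in this issue could plausibly be a thin wrapper over it. A sketch, assuming the signature proposed above (illustration only, not an agreed design):

// Hypothetical wrapper over the cuda::ptx function proposed above.
#include <cstdint>
#include <cuda/ptx>

__device__ __forceinline__
void __pipeline_arrive_on_noinc(uint64_t* barrier)
{
    // Emits cp.async.mbarrier.arrive.noinc.b64 [barrier]; (PTX ISA 70, SM_80+).
    cuda::ptx::cp_async_mbarrier_arrive_noinc(barrier);
}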
