[FEA]: Add __pipeline_arrive_on_noinc #2927

Status: Open · 1 task done
igor-nv opened this issue Nov 21, 2024 · 1 comment
Labels: feature request (New feature or request.)

igor-nv commented Nov 21, 2024

Is this a duplicate?

Area

libcu++

Is your feature request related to a problem? Please describe.

Please expose a noinc version of __pipeline_arrive_on. It would allow avoiding a double barrier arrive when synchronizing LDGSTS (cp.async) transfers via an mbarrier; a usage sketch follows the snippet below.

Describe the solution you'd like

__device__ __forceinline__
void __pipeline_arrive_on_noinc(uint64_t* barrier)
{
    // Trigger an arrive-on for prior cp.async (LDGSTS) operations without
    // incrementing the mbarrier's pending count.
    asm volatile (
        "cp.async.mbarrier.arrive.noinc.shared.b64 [%0];"
        :: "r"(static_cast<uint32_t>(__cvta_generic_to_shared(barrier)))
        : "memory"
    );
}
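
For context, here is a minimal, hypothetical usage sketch of the requested primitive (not part of the original request). It assumes the existing <cuda_pipeline_primitives.h> and <cuda_awbarrier_primitives.h> intrinsics, and the expected-count accounting (2 * blockDim.x: one explicit arrive plus one asynchronous arrive-on per thread) is an assumption about how the noinc form would be balanced:

// Hypothetical usage sketch: launch with 128 threads per block.
#include <cstdint>
#include <cuda_awbarrier_primitives.h>   // __mbarrier_* intrinsics
#include <cuda_pipeline_primitives.h>    // __pipeline_* intrinsics

__global__ void copy_tile(const int* __restrict__ gmem, int* __restrict__ out)
{
    __shared__ int smem[128];
    __shared__ __mbarrier_t bar;

    if (threadIdx.x == 0) {
        // Assumption: with noinc the asynchronous arrive-on is not pre-compensated,
        // so the expected count covers both the explicit and the async arrivals.
        __mbarrier_init(&bar, 2 * blockDim.x);
    }
    __syncthreads();

    // LDGSTS: asynchronous global -> shared copy.
    __pipeline_memcpy_async(&smem[threadIdx.x], &gmem[threadIdx.x], sizeof(int));
    __pipeline_commit();

    // Proposed primitive: arrive on the barrier when the copy completes,
    // without the extra pending-count increment done by __pipeline_arrive_on.
    __pipeline_arrive_on_noinc(reinterpret_cast<uint64_t*>(&bar));

    // Single explicit arrive per thread; the token identifies the phase to wait on.
    __mbarrier_token_t token = __mbarrier_arrive(&bar);
    while (!__mbarrier_test_wait(&bar, token)) { /* spin */ }

    out[threadIdx.x] = smem[threadIdx.x];
}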

Describe alternatives you've considered

No response

Additional context

No response

igor-nv added the "feature request" label on Nov 21, 2024
ahendriksen (Contributor) commented:

We would need to add the following .rst documentation:

cp.async.mbarrier.arrive.b64
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda

   // cp.async.mbarrier.arrive.b64 [addr]; // PTX ISA 70, SM_80
   template <typename=void>
   __device__ static inline void cp_async_mbarrier_arrive(
     uint64_t* addr);

cp.async.mbarrier.arrive.noinc.b64
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda

   // cp.async.mbarrier.arrive.noinc.b64 [addr]; // PTX ISA 70, SM_80
   template <typename=void>
   __device__ static inline void cp_async_mbarrier_arrive_noinc(
     uint64_t* addr);

The following header code:

/*
// cp.async.mbarrier.arrive.b64 [addr]; // PTX ISA 70, SM_80
template <typename=void>
__device__ static inline void cp_async_mbarrier_arrive(
  uint64_t* addr);
*/
#if __cccl_ptx_isa >= 700
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_mbarrier_arrive_is_not_supported_before_SM_80__();
template <typename=void>
_CCCL_DEVICE static inline void cp_async_mbarrier_arrive(
  _CUDA_VSTD::uint64_t* __addr)
{
  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,(
    asm (
      "cp.async.mbarrier.arrive.b64 [%0];"
      :
      : "r"(__as_ptr_smem(__addr))
      : "memory"
    );
  ),(
    // Unsupported architectures will have a linker error with a semi-decent error message
    __cuda_ptx_cp_async_mbarrier_arrive_is_not_supported_before_SM_80__();
  ));
}
#endif // __cccl_ptx_isa >= 700

/*
// cp.async.mbarrier.arrive.noinc.b64 [addr]; // PTX ISA 70, SM_80
template <typename=void>
__device__ static inline void cp_async_mbarrier_arrive_noinc(
  uint64_t* addr);
*/
#if __cccl_ptx_isa >= 700
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_mbarrier_arrive_noinc_is_not_supported_before_SM_80__();
template <typename=void>
_CCCL_DEVICE static inline void cp_async_mbarrier_arrive_noinc(
  _CUDA_VSTD::uint64_t* __addr)
{
  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,(
    asm (
      "cp.async.mbarrier.arrive.noinc.b64 [%0];"
      :
      : "r"(__as_ptr_smem(__addr))
      : "memory"
    );
  ),(
    // Unsupported architectures will have a linker error with a semi-decent error message
    __cuda_ptx_cp_async_mbarrier_arrive_noinc_is_not_supported_before_SM_80__();
  ));
}
#endif // __cccl_ptx_isa >= 700

And the following tests:

__global__ void test_cp_async_mbarrier_arrive(void ** fn_ptr) {
#if __cccl_ptx_isa >= 700
  NV_IF_TARGET(NV_PROVIDES_SM_80, (
    // cp.async.mbarrier.arrive.b64 [addr];
    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(uint64_t* )>(cuda::ptx::cp_async_mbarrier_arrive));
  ));
#endif // __cccl_ptx_isa >= 700

#if __cccl_ptx_isa >= 700
  NV_IF_TARGET(NV_PROVIDES_SM_80, (
    // cp.async.mbarrier.arrive.noinc.b64 [addr];
    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(uint64_t* )>(cuda::ptx::cp_async_mbarrier_arrive_noinc));
  ));
#endif // __cccl_ptx_isa >= 700
}
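
If such a cuda::ptx::cp_async_mbarrier_arrive_noinc exposure lands, the intrinsic requested in this issue could plausibly be a thin wrapper over it. A sketch, assuming the signature proposed above (illustration only, not an agreed design):

// Hypothetical wrapper over the cuda::ptx function proposed above.
#include <cstdint>
#include <cuda/ptx>

__device__ __forceinline__
void __pipeline_arrive_on_noinc(uint64_t* barrier)
{
    // Emits cp.async.mbarrier.arrive.noinc.b64 [barrier]; (PTX ISA 70, SM_80+).
    cuda::ptx::cp_async_mbarrier_arrive_noinc(barrier);
}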
