Skip to content

Commit

Permalink
First pass at making atomic use new backends
Browse files Browse the repository at this point in the history
  • Loading branch information
wmaxey committed Apr 13, 2024
1 parent 67626f5 commit e085e18
Show file tree
Hide file tree
Showing 18 changed files with 1,636 additions and 820 deletions.
10 changes: 7 additions & 3 deletions libcudacxx/codegen/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,11 @@ int main()
//
//===----------------------------------------------------------------------===//
// This is a autogenerated file, we want to ensure that it contains exactly the contentes we want to generate
// This is a autogenerated file, we want to ensure that it contains exactly the contents we want to generate
// clang-format off
_LIBCUDACXX_BEGIN_NAMESPACE_STD
)XXX";

auto scopenametag = [&](auto scope) {
Expand Down Expand Up @@ -302,11 +305,11 @@ int main()
{
out << "template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==" << sz / 8 << ", int> = 0>\n";
out << "_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(" << cv
<< "_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int "
<< "_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int "
"__failure_memorder, "
<< scopenametag(s.first) << ") {\n";
out << " uint" << sz << "_t __tmp = 0, __old = 0, __old_tmp;\n";
out << " memcpy(&__tmp, __desired, " << sz / 8 << ");\n";
out << " memcpy(&__tmp, &__desired, " << sz / 8 << ");\n";
out << " memcpy(&__old, __expected, " << sz / 8 << ");\n";
out << " __old_tmp = __old;\n";
out << " NV_DISPATCH_TARGET(\n";
Expand Down Expand Up @@ -507,6 +510,7 @@ int main()
}
}

out << "\n_LIBCUDACXX_END_NAMESPACE_STD\n";
out << "\n// clang-format on\n";

return 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@
//
//===----------------------------------------------------------------------===//

// This is a autogenerated file, we want to ensure that it contains exactly the contentes we want to generate
// This is a autogenerated file, we want to ensure that it contains exactly the contents we want to generate
// clang-format off

_LIBCUDACXX_BEGIN_NAMESPACE_STD

// Emits the PTX instruction "membar.cta" through inline asm.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
static inline _LIBCUDACXX_DEVICE void __cuda_membar_block() { asm volatile("membar.cta;":::"memory"); }
// Emits the PTX instruction "fence.acq_rel.cta" through inline asm.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
static inline _LIBCUDACXX_DEVICE void __cuda_fence_acq_rel_block() { asm volatile("fence.acq_rel.cta;":::"memory"); }
// Emits the PTX instruction "fence.sc.cta" through inline asm.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
static inline _LIBCUDACXX_DEVICE void __cuda_fence_sc_block() { asm volatile("fence.sc.cta;":::"memory"); }
Expand Down Expand Up @@ -249,9 +252,9 @@ template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inli
// Emits the PTX instruction "atom.cas.release.cta.b32" through inline asm.
//   __ptr        - address operand, bound with the "l" constraint and used as [%1].
//   __dst        - output operand %0 ("=r"); receives the instruction's result.
//   __cmp, __op  - input operands %2 and %3 ("r"), passed to atom.cas in that order.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_release_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
// Emits the PTX instruction "atom.cas.cta.b32" through inline asm; unlike the
// "_release_" sibling, the instruction string carries no ordering qualifier.
//   __ptr        - address operand, bound with the "l" constraint and used as [%1].
//   __dst        - output operand %0 ("=r"); receives the instruction's result.
//   __cmp, __op  - input operands %2 and %3 ("r"), passed to atom.cas in that order.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_volatile_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==4, int> = 0>
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) {
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) {
uint32_t __tmp = 0, __old = 0, __old_tmp;
memcpy(&__tmp, __desired, 4);
memcpy(&__tmp, &__desired, 4);
memcpy(&__old, __expected, 4);
__old_tmp = __old;
NV_DISPATCH_TARGET(
Expand Down Expand Up @@ -283,9 +286,9 @@ _LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _T
return __ret;
}
template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==4, int> = 0>
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) {
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) {
uint32_t __tmp = 0, __old = 0, __old_tmp;
memcpy(&__tmp, __desired, 4);
memcpy(&__tmp, &__desired, 4);
memcpy(&__old, __expected, 4);
__old_tmp = __old;
NV_DISPATCH_TARGET(
Expand Down Expand Up @@ -1156,9 +1159,9 @@ template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inli
// Emits the PTX instruction "atom.cas.release.cta.b64" through inline asm.
//   __ptr        - address operand, bound with the "l" constraint and used as [%1].
//   __dst        - output operand %0 ("=l"); receives the instruction's result.
//   __cmp, __op  - input operands %2 and %3 ("l"), passed to atom.cas in that order.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_release_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); }
// Emits the PTX instruction "atom.cas.cta.b64" through inline asm; unlike the
// "_release_" sibling, the instruction string carries no ordering qualifier.
//   __ptr        - address operand, bound with the "l" constraint and used as [%1].
//   __dst        - output operand %0 ("=l"); receives the instruction's result.
//   __cmp, __op  - input operands %2 and %3 ("l"), passed to atom.cas in that order.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_volatile_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); }
template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==8, int> = 0>
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) {
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) {
uint64_t __tmp = 0, __old = 0, __old_tmp;
memcpy(&__tmp, __desired, 8);
memcpy(&__tmp, &__desired, 8);
memcpy(&__old, __expected, 8);
__old_tmp = __old;
NV_DISPATCH_TARGET(
Expand Down Expand Up @@ -1190,9 +1193,9 @@ _LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _T
return __ret;
}
template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==8, int> = 0>
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) {
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) {
uint64_t __tmp = 0, __old = 0, __old_tmp;
memcpy(&__tmp, __desired, 8);
memcpy(&__tmp, &__desired, 8);
memcpy(&__old, __expected, 8);
__old_tmp = __old;
NV_DISPATCH_TARGET(
Expand Down Expand Up @@ -2426,9 +2429,9 @@ template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inli
// Emits the PTX instruction "atom.cas.release.gpu.b32" through inline asm.
//   __ptr        - address operand, bound with the "l" constraint and used as [%1].
//   __dst        - output operand %0 ("=r"); receives the instruction's result.
//   __cmp, __op  - input operands %2 and %3 ("r"), passed to atom.cas in that order.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_release_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
// Emits the PTX instruction "atom.cas.gpu.b32" through inline asm; unlike the
// "_release_" sibling, the instruction string carries no ordering qualifier.
//   __ptr        - address operand, bound with the "l" constraint and used as [%1].
//   __dst        - output operand %0 ("=r"); receives the instruction's result.
//   __cmp, __op  - input operands %2 and %3 ("r"), passed to atom.cas in that order.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==4, int> = 0>
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) {
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) {
uint32_t __tmp = 0, __old = 0, __old_tmp;
memcpy(&__tmp, __desired, 4);
memcpy(&__tmp, &__desired, 4);
memcpy(&__old, __expected, 4);
__old_tmp = __old;
NV_DISPATCH_TARGET(
Expand Down Expand Up @@ -2460,9 +2463,9 @@ _LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _T
return __ret;
}
template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==4, int> = 0>
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) {
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) {
uint32_t __tmp = 0, __old = 0, __old_tmp;
memcpy(&__tmp, __desired, 4);
memcpy(&__tmp, &__desired, 4);
memcpy(&__old, __expected, 4);
__old_tmp = __old;
NV_DISPATCH_TARGET(
Expand Down Expand Up @@ -3333,9 +3336,9 @@ template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inli
// Emits the PTX instruction "atom.cas.release.gpu.b64" through inline asm.
//   __ptr        - address operand, bound with the "l" constraint and used as [%1].
//   __dst        - output operand %0 ("=l"); receives the instruction's result.
//   __cmp, __op  - input operands %2 and %3 ("l"), passed to atom.cas in that order.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_release_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); }
// Emits the PTX instruction "atom.cas.gpu.b64" through inline asm; unlike the
// "_release_" sibling, the instruction string carries no ordering qualifier.
//   __ptr        - address operand, bound with the "l" constraint and used as [%1].
//   __dst        - output operand %0 ("=l"); receives the instruction's result.
//   __cmp, __op  - input operands %2 and %3 ("l"), passed to atom.cas in that order.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_volatile_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); }
template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==8, int> = 0>
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) {
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) {
uint64_t __tmp = 0, __old = 0, __old_tmp;
memcpy(&__tmp, __desired, 8);
memcpy(&__tmp, &__desired, 8);
memcpy(&__old, __expected, 8);
__old_tmp = __old;
NV_DISPATCH_TARGET(
Expand Down Expand Up @@ -3367,9 +3370,9 @@ _LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _T
return __ret;
}
template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==8, int> = 0>
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) {
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) {
uint64_t __tmp = 0, __old = 0, __old_tmp;
memcpy(&__tmp, __desired, 8);
memcpy(&__tmp, &__desired, 8);
memcpy(&__old, __expected, 8);
__old_tmp = __old;
NV_DISPATCH_TARGET(
Expand Down Expand Up @@ -4603,9 +4606,9 @@ template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inli
// Emits the PTX instruction "atom.cas.release.sys.b32" through inline asm.
//   __ptr        - address operand, bound with the "l" constraint and used as [%1].
//   __dst        - output operand %0 ("=r"); receives the instruction's result.
//   __cmp, __op  - input operands %2 and %3 ("r"), passed to atom.cas in that order.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_release_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
// Emits the PTX instruction "atom.cas.sys.b32" through inline asm; unlike the
// "_release_" sibling, the instruction string carries no ordering qualifier.
//   __ptr        - address operand, bound with the "l" constraint and used as [%1].
//   __dst        - output operand %0 ("=r"); receives the instruction's result.
//   __cmp, __op  - input operands %2 and %3 ("r"), passed to atom.cas in that order.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_volatile_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); }
template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==4, int> = 0>
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) {
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) {
uint32_t __tmp = 0, __old = 0, __old_tmp;
memcpy(&__tmp, __desired, 4);
memcpy(&__tmp, &__desired, 4);
memcpy(&__old, __expected, 4);
__old_tmp = __old;
NV_DISPATCH_TARGET(
Expand Down Expand Up @@ -4637,9 +4640,9 @@ _LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _T
return __ret;
}
template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==4, int> = 0>
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) {
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) {
uint32_t __tmp = 0, __old = 0, __old_tmp;
memcpy(&__tmp, __desired, 4);
memcpy(&__tmp, &__desired, 4);
memcpy(&__old, __expected, 4);
__old_tmp = __old;
NV_DISPATCH_TARGET(
Expand Down Expand Up @@ -5510,9 +5513,9 @@ template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inli
// Emits the PTX instruction "atom.cas.release.sys.b64" through inline asm.
//   __ptr        - address operand, bound with the "l" constraint and used as [%1].
//   __dst        - output operand %0 ("=l"); receives the instruction's result.
//   __cmp, __op  - input operands %2 and %3 ("l"), passed to atom.cas in that order.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_release_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); }
// Emits the PTX instruction "atom.cas.sys.b64" through inline asm; unlike the
// "_release_" sibling, the instruction string carries no ordering qualifier.
//   __ptr        - address operand, bound with the "l" constraint and used as [%1].
//   __dst        - output operand %0 ("=l"); receives the instruction's result.
//   __cmp, __op  - input operands %2 and %3 ("l"), passed to atom.cas in that order.
// The "memory" clobber tells the compiler the statement may touch arbitrary memory.
template<class _CUDA_A, class _CUDA_B, class _CUDA_C, class _CUDA_D> static inline _LIBCUDACXX_DEVICE void __cuda_compare_exchange_volatile_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); }
template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==8, int> = 0>
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) {
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) {
uint64_t __tmp = 0, __old = 0, __old_tmp;
memcpy(&__tmp, __desired, 8);
memcpy(&__tmp, &__desired, 8);
memcpy(&__old, __expected, 8);
__old_tmp = __old;
NV_DISPATCH_TARGET(
Expand Down Expand Up @@ -5544,9 +5547,9 @@ _LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _T
return __ret;
}
template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==8, int> = 0>
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) {
_LIBCUDACXX_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) {
uint64_t __tmp = 0, __old = 0, __old_tmp;
memcpy(&__tmp, __desired, 8);
memcpy(&__tmp, &__desired, 8);
memcpy(&__old, __expected, 8);
__old_tmp = __old;
NV_DISPATCH_TARGET(
Expand Down Expand Up @@ -6542,4 +6545,6 @@ _LIBCUDACXX_DEVICE _Type* __atomic_fetch_sub_cuda(_Type **__ptr, ptrdiff_t __val
return __ret;
}

_LIBCUDACXX_END_NAMESPACE_STD

// clang-format on
Loading

0 comments on commit e085e18

Please sign in to comment.