Format libcudacxx/include files without extensions #1676

Merged
15 changes: 14 additions & 1 deletion .pre-commit-config.yaml
@@ -20,7 +20,20 @@ repos:
     rev: v17.0.6
     hooks:
       - id: clang-format
-        types_or: [c, c++, cuda]
+        types_or: [file]
+        files: |
+          (?x)^(
+            ^.*\.c$|
+            ^.*\.cpp$|
+            ^.*\.cu$|
+            ^.*\.cuh$|
+            ^.*\.cxx$|
+            ^.*\.h$|
+            ^.*\.hpp$|
+            ^.*\.inl$|
+            ^.*\.mm$|
+            ^libcudacxx/include/.*/[^.]*$
+          )
         args: ["-fallback-style=none", "-style=file", "-i"]

 default_language_version:
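With `types_or: [file]`, pre-commit no longer relies on extension-based file-type tagging, so the explicit `files:` pattern above is what decides which paths clang-format sees. To sanity-check what that filter catches, here is a small, hypothetical self-check (not part of the PR): it approximates the `(?x)` pattern as a single-line ECMAScript regex, since `std::regex` has no free-spacing mode, and the example paths are purely illustrative.

```cuda
#include <iostream>
#include <regex>

int main()
{
  // Rough single-line equivalent of the (?x) pattern above: the usual
  // C/C++/CUDA extensions, plus any extensionless file under libcudacxx/include/.
  const std::regex clang_format_files(
    R"(^.*\.(c|cpp|cu|cuh|cxx|h|hpp|inl|mm)$|^libcudacxx/include/.*/[^.]*$)");

  for (const char* path : {"libcudacxx/include/cuda/barrier",     // extensionless header -> formatted
                           "libcudacxx/include/cuda/std/barrier", // extensionless header -> formatted
                           "thrust/thrust/sort.h",                // matched by extension
                           "docs/releases.md"})                   // not matched
  {
    std::cout << path << " -> " << (std::regex_match(path, clang_format_files) ? "formatted" : "skipped") << "\n";
  }
  return 0;
}
```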
596 changes: 360 additions & 236 deletions libcudacxx/include/cuda/annotated_ptr

Large diffs are not rendered by default.

226 changes: 118 additions & 108 deletions libcudacxx/include/cuda/barrier
@@ -21,8 +21,8 @@
 # pragma system_header
 #endif // no system header

-#include <cuda/std/barrier>
 #include <cuda/ptx>
+#include <cuda/std/barrier>

 // Forward-declare CUtensorMap for use in cp_async_bulk_tensor_* PTX wrapping
 // functions. These functions take a pointer to CUtensorMap, so do not need to
@@ -54,175 +54,185 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL
 #ifdef __cccl_lib_experimental_ctk12_cp_async_exposure

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
-inline _CCCL_DEVICE
-void cp_async_bulk_global_to_shared(void *__dest, const void *__src, _CUDA_VSTD::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
+inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
+void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
 {
-_LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
-_LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "Destination must be shared memory address.");
-_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "Source must be global memory address.");
-
-_CUDA_VPTX::cp_async_bulk(
-_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
-__dest, __src, __size,
-::cuda::device::barrier_native_handle(__bar));
+_LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
+_LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "Destination must be shared memory address.");
+_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "Source must be global memory address.");
+
+_CUDA_VPTX::cp_async_bulk(
+_CUDA_VPTX::space_cluster,
+_CUDA_VPTX::space_global,
+__dest,
+__src,
+__size,
+::cuda::device::barrier_native_handle(__bar));
 }
-

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
-inline _CCCL_DEVICE
-void cp_async_bulk_shared_to_global(void *__dest, const void * __src, _CUDA_VSTD::uint32_t __size)
+inline _CCCL_DEVICE void cp_async_bulk_shared_to_global(void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size)
 {
-_LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
-_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__dest), "Destination must be global memory address.");
-_LIBCUDACXX_DEBUG_ASSERT(__isShared(__src), "Source must be shared memory address.");
+_LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
+_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__dest), "Destination must be global memory address.");
+_LIBCUDACXX_DEBUG_ASSERT(__isShared(__src), "Source must be shared memory address.");

-_CUDA_VPTX::cp_async_bulk(
-_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
-__dest, __src, __size);
+_CUDA_VPTX::cp_async_bulk(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __dest, __src, __size);
 }

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
-inline _CCCL_DEVICE
-void cp_async_bulk_tensor_1d_global_to_shared(
-void *__dest, const CUtensorMap *__tensor_map , int __c0, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
+inline _CCCL_DEVICE void cp_async_bulk_tensor_1d_global_to_shared(
+void* __dest, const CUtensorMap* __tensor_map, int __c0, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
 {
-const _CUDA_VSTD::int32_t __coords[]{__c0};
-
-_CUDA_VPTX::cp_async_bulk_tensor(
-_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
-__dest, __tensor_map, __coords,
-::cuda::device::barrier_native_handle(__bar));
+const _CUDA_VSTD::int32_t __coords[]{__c0};
+
+_CUDA_VPTX::cp_async_bulk_tensor(
+_CUDA_VPTX::space_cluster,
+_CUDA_VPTX::space_global,
+__dest,
+__tensor_map,
+__coords,
+::cuda::device::barrier_native_handle(__bar));
 }

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
-inline _CCCL_DEVICE
-void cp_async_bulk_tensor_2d_global_to_shared(
-void *__dest, const CUtensorMap *__tensor_map , int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
+inline _CCCL_DEVICE void cp_async_bulk_tensor_2d_global_to_shared(
+void* __dest, const CUtensorMap* __tensor_map, int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
 {
-const _CUDA_VSTD::int32_t __coords[]{__c0, __c1};
-
-_CUDA_VPTX::cp_async_bulk_tensor(
-_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
-__dest, __tensor_map, __coords,
-::cuda::device::barrier_native_handle(__bar));
+const _CUDA_VSTD::int32_t __coords[]{__c0, __c1};
+
+_CUDA_VPTX::cp_async_bulk_tensor(
+_CUDA_VPTX::space_cluster,
+_CUDA_VPTX::space_global,
+__dest,
+__tensor_map,
+__coords,
+::cuda::device::barrier_native_handle(__bar));
 }

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
-inline _CCCL_DEVICE
-void cp_async_bulk_tensor_3d_global_to_shared(
-void *__dest, const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
+inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_global_to_shared(
+void* __dest,
+const CUtensorMap* __tensor_map,
+int __c0,
+int __c1,
+int __c2,
+::cuda::barrier<::cuda::thread_scope_block>& __bar)
 {
-const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};
-
-_CUDA_VPTX::cp_async_bulk_tensor(
-_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
-__dest, __tensor_map, __coords,
-::cuda::device::barrier_native_handle(__bar));
+const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};
+
+_CUDA_VPTX::cp_async_bulk_tensor(
+_CUDA_VPTX::space_cluster,
+_CUDA_VPTX::space_global,
+__dest,
+__tensor_map,
+__coords,
+::cuda::device::barrier_native_handle(__bar));
 }

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
-inline _CCCL_DEVICE
-void cp_async_bulk_tensor_4d_global_to_shared(
-void *__dest, const CUtensorMap *__tensor_map , int __c0, int __c1, int __c2, int __c3, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
+inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_global_to_shared(
+void* __dest,
+const CUtensorMap* __tensor_map,
+int __c0,
+int __c1,
+int __c2,
+int __c3,
+::cuda::barrier<::cuda::thread_scope_block>& __bar)
 {
-const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};
-
-_CUDA_VPTX::cp_async_bulk_tensor(
-_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
-__dest, __tensor_map, __coords,
-::cuda::device::barrier_native_handle(__bar));
+const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};
+
+_CUDA_VPTX::cp_async_bulk_tensor(
+_CUDA_VPTX::space_cluster,
+_CUDA_VPTX::space_global,
+__dest,
+__tensor_map,
+__coords,
+::cuda::device::barrier_native_handle(__bar));
 }

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
-inline _CCCL_DEVICE
-void cp_async_bulk_tensor_5d_global_to_shared(
-void *__dest, const CUtensorMap *__tensor_map , int __c0, int __c1, int __c2, int __c3, int __c4, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
+inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_global_to_shared(
+void* __dest,
+const CUtensorMap* __tensor_map,
+int __c0,
+int __c1,
+int __c2,
+int __c3,
+int __c4,
+::cuda::barrier<::cuda::thread_scope_block>& __bar)
 {
-const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};
-
-_CUDA_VPTX::cp_async_bulk_tensor(
-_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
-__dest, __tensor_map, __coords,
-::cuda::device::barrier_native_handle(__bar));
+const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};
+
+_CUDA_VPTX::cp_async_bulk_tensor(
+_CUDA_VPTX::space_cluster,
+_CUDA_VPTX::space_global,
+__dest,
+__tensor_map,
+__coords,
+::cuda::device::barrier_native_handle(__bar));
 }

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
-inline _CCCL_DEVICE
-void cp_async_bulk_tensor_1d_shared_to_global(
-const CUtensorMap *__tensor_map, int __c0, const void *__src)
+inline _CCCL_DEVICE void
+cp_async_bulk_tensor_1d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, const void* __src)
 {
-const _CUDA_VSTD::int32_t __coords[]{__c0};
+const _CUDA_VSTD::int32_t __coords[]{__c0};

-_CUDA_VPTX::cp_async_bulk_tensor(
-_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
-__tensor_map, __coords, __src);
+_CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
 }

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
-inline _CCCL_DEVICE
-void cp_async_bulk_tensor_2d_shared_to_global(
-const CUtensorMap *__tensor_map, int __c0, int __c1, const void *__src)
+inline _CCCL_DEVICE void
+cp_async_bulk_tensor_2d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, int __c1, const void* __src)
 {
-const _CUDA_VSTD::int32_t __coords[]{__c0, __c1};
+const _CUDA_VSTD::int32_t __coords[]{__c0, __c1};

-_CUDA_VPTX::cp_async_bulk_tensor(
-_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
-__tensor_map, __coords, __src);
+_CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
 }

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
-inline _CCCL_DEVICE
-void cp_async_bulk_tensor_3d_shared_to_global(
-const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, const void *__src)
+inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_shared_to_global(
+const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, const void* __src)
 {
-const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};
+const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};

-_CUDA_VPTX::cp_async_bulk_tensor(
-_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
-__tensor_map, __coords, __src);
+_CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
 }

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
-inline _CCCL_DEVICE
-void cp_async_bulk_tensor_4d_shared_to_global(
-const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, const void *__src)
+inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_shared_to_global(
+const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, const void* __src)
 {
-const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};
+const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};

-_CUDA_VPTX::cp_async_bulk_tensor(
-_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
-__tensor_map, __coords, __src);
+_CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
 }

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
-inline _CCCL_DEVICE
-void cp_async_bulk_tensor_5d_shared_to_global(
-const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void *__src)
+inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_shared_to_global(
+const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void* __src)
 {
-const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};
+const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

-_CUDA_VPTX::cp_async_bulk_tensor(
-_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
-__tensor_map, __coords, __src);
+_CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
 }

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
-inline _CCCL_DEVICE
-void fence_proxy_async_shared_cta() {
-_CUDA_VPTX::fence_proxy_async(_CUDA_VPTX::space_shared);
+inline _CCCL_DEVICE void fence_proxy_async_shared_cta()
+{
+_CUDA_VPTX::fence_proxy_async(_CUDA_VPTX::space_shared);
 }

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
-inline _CCCL_DEVICE
-void cp_async_bulk_commit_group()
+inline _CCCL_DEVICE void cp_async_bulk_commit_group()
 {
-_CUDA_VPTX::cp_async_bulk_commit_group();
+_CUDA_VPTX::cp_async_bulk_commit_group();
 }

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
 template <int __n_prior>
-inline _CCCL_DEVICE
-void cp_async_bulk_wait_group_read()
+inline _CCCL_DEVICE void cp_async_bulk_wait_group_read()
 {
 static_assert(__n_prior <= 63, "cp_async_bulk_wait_group_read: waiting for more than 63 groups is not supported.");
 _CUDA_VPTX::cp_async_bulk_wait_group_read(_CUDA_VPTX::n32_t<__n_prior>{});
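For reference, a minimal usage sketch of the reformatted device functions above (not part of this diff): it follows the bulk-copy pattern from the CUDA Programming Guide, assumes an sm_90 target, and assumes `cuda::device::barrier_arrive_tx` is available from this same header.

```cuda
#include <cstddef>
#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move

namespace cde = cuda::device::experimental;
using block_barrier = cuda::barrier<cuda::thread_scope_block>;

constexpr std::size_t buf_len = 1024;

// Illustrative kernel: bulk-copy a tile into shared memory, update it, copy it back.
__global__ void bulk_copy_add_one(int* data, std::size_t offset)
{
  // Destination must be shared memory, 16-byte aligned, with a size that is a multiple of 16.
  __shared__ alignas(16) int smem[buf_len];

#pragma nv_diag_suppress static_var_with_dynamic_init
  __shared__ block_barrier bar;
  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x); // one arrival per participating thread
    cde::fence_proxy_async_shared_cta(); // make the initialized barrier visible to the async proxy
  }
  __syncthreads();

  block_barrier::arrival_token token;
  if (threadIdx.x == 0)
  {
    // Kick off the bulk global->shared copy and tell the barrier how many bytes to expect.
    cde::cp_async_bulk_global_to_shared(smem, data + offset, sizeof(smem), bar);
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem));
  }
  else
  {
    token = bar.arrive();
  }
  bar.wait(cuda::std::move(token)); // smem is safe to read once wait() returns

  for (std::size_t i = threadIdx.x; i < buf_len; i += blockDim.x)
  {
    smem[i] += 1;
  }
  cde::fence_proxy_async_shared_cta(); // order the shared-memory writes before the bulk copy back
  __syncthreads();

  if (threadIdx.x == 0)
  {
    cde::cp_async_bulk_shared_to_global(data + offset, smem, sizeof(smem));
    cde::cp_async_bulk_commit_group();
    cde::cp_async_bulk_wait_group_read<0>(); // smem may be reused after this returns
  }
}
```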
10 changes: 5 additions & 5 deletions libcudacxx/include/cuda/discard_memory
@@ -36,14 +36,14 @@ inline _CCCL_HOST_DEVICE void discard_memory(volatile void* __ptr, size_t __nbyt
 NV_PROVIDES_SM_80,
 (if (!__isGlobal((void*) __ptr)) return;

-char* __p = reinterpret_cast<char*>(const_cast<void*>(__ptr));
-char* const __end_p = __p + __nbytes;
+char* __p = reinterpret_cast<char*>(const_cast<void*>(__ptr));
+char* const __end_p = __p + __nbytes;
 static constexpr size_t _LINE_SIZE = 128;

 // Trim the first block and last block if they're not 128 bytes aligned
-size_t __misalignment = reinterpret_cast<uintptr_t>(__p) % _LINE_SIZE;
-char* __start_aligned = __misalignment == 0 ? __p : __p + (_LINE_SIZE - __misalignment);
-char* const __end_aligned = __end_p - (reinterpret_cast<uintptr_t>(__end_p) % _LINE_SIZE);
+size_t __misalignment = reinterpret_cast<uintptr_t>(__p) % _LINE_SIZE;
+char* __start_aligned = __misalignment == 0 ? __p : __p + (_LINE_SIZE - __misalignment);
+char* const __end_aligned = __end_p - (reinterpret_cast<uintptr_t>(__end_p) % _LINE_SIZE);

 while (__start_aligned < __end_aligned) {
 asm volatile("discard.global.L2 [%0], 128;" ::"l"(__start_aligned) :);
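And a short, illustrative use of `cuda::discard_memory` (again not part of the diff): the trimming logic above means only the 128-byte lines fully contained in the given range are discarded, so partially covered lines at either end are left untouched.

```cuda
#include <cstddef>
#include <cuda/discard_memory>

// Hypothetical kernel: once a block is finished with a global-memory scratch
// buffer, hint to the L2 cache that its contents need not be preserved.
__global__ void reduce_then_discard(const float* in, float* scratch, float* out, std::size_t n)
{
  // ... produce a partial result in scratch and fold it into out ...

  // The scratch values are dead now; on sm_80+ this issues discard.global.L2
  // for each fully covered, 128-byte-aligned line in the byte range.
  cuda::discard_memory(scratch, n * sizeof(float));
}
```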