Skip to content

Commit

Permalink
PTX: Update existing instructions (#3584)
Browse files Browse the repository at this point in the history
* mbarrier.expect_tx: Add missing source and test
It was already documented(!)

* cp.async.bulk.tensor: Add .{gather,scatter}4
* fence: Add .sync_restrict, .proxy.async.sync_restrict

Co-authored-by: Allard Hendriksen <[email protected]>
  • Loading branch information
bernhardmgruber and ahendriksen authored Jan 30, 2025
1 parent 15a0116 commit 5ce5d28
Show file tree
Hide file tree
Showing 9 changed files with 81 additions and 0 deletions.
5 changes: 5 additions & 0 deletions docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,8 @@ Multicast
---------

.. include:: generated/cp_async_bulk_tensor_multicast.rst

Scatter / Gather
----------------

.. include:: generated/cp_async_bulk_tensor_gather_scatter.rst
10 changes: 10 additions & 0 deletions docs/libcudacxx/ptx/instructions/fence.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ fence

.. include:: generated/fence.rst

fence.sync_restrict
-------------------

.. include:: generated/fence_sync_restrict.rst

fence.mbarrier_init
-------------------

Expand All @@ -29,6 +34,11 @@ fence.proxy.async

.. include:: generated/fence_proxy_async.rst

fence.proxy.async.sync_restrict
-------------------------------

.. include:: generated/fence_proxy_async_generic_sync_restrict.rst

fence.proxy.tensormap
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
// 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
#include <cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h>
#include <cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h>
#include <cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h>

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
Expand Down
2 changes: 2 additions & 0 deletions libcudacxx/include/cuda/__ptx/instructions/fence.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
#include <cuda/__ptx/instructions/generated/fence_mbarrier_init.h>
#include <cuda/__ptx/instructions/generated/fence_proxy_alias.h>
#include <cuda/__ptx/instructions/generated/fence_proxy_async.h>
#include <cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h>
#include <cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h>
#include <cuda/__ptx/instructions/generated/fence_sync_restrict.h>

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX

Expand Down
37 changes: 37 additions & 0 deletions libcudacxx/include/cuda/__ptx/instructions/mbarrier_expect_tx.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_PTX_MBARRIER_EXPECT_TX_H_
#define _CUDA_PTX_MBARRIER_EXPECT_TX_H_

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/__ptx/ptx_dot_variants.h>
#include <cuda/__ptx/ptx_helper_functions.h>
#include <cuda/std/cstdint>

#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX

#include <cuda/__ptx/instructions/generated/mbarrier_expect_tx.h>

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX

#endif // _CUDA_PTX_MBARRIER_EXPECT_TX_H_
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/ptx
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
#include <cuda/__ptx/instructions/get_sreg.h>
#include <cuda/__ptx/instructions/getctarank.h>
#include <cuda/__ptx/instructions/mbarrier_arrive.h>
#include <cuda/__ptx/instructions/mbarrier_expect_tx.h>
#include <cuda/__ptx/instructions/mbarrier_init.h>
#include <cuda/__ptx/instructions/mbarrier_wait.h>
#include <cuda/__ptx/instructions/red_async.h>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "nvrtc_workaround.h"
// above header needs to be included before the generated test header
#include "generated/cp_async_bulk_tensor.h"
#include "generated/cp_async_bulk_tensor_gather_scatter.h"

int main(int, char**)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
#include "generated/fence_mbarrier_init.h"
#include "generated/fence_proxy_alias.h"
#include "generated/fence_proxy_async.h"
#include "generated/fence_proxy_async_generic_sync_restrict.h"
#include "generated/fence_proxy_tensormap_generic.h"
#include "generated/fence_sync_restrict.h"

int main(int, char**)
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

#include "generated/mbarrier_expect_tx.h"

int main(int, char**)
{
return 0;
}

0 comments on commit 5ce5d28

Please sign in to comment.