From 83b10c27884dad6006280f6c9b14234f413da704 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Tue, 28 Jan 2025 21:55:48 +0100 Subject: [PATCH 01/33] Update ptx_isa.h to include 8.6 and 8.7 (#3563) --- libcudacxx/include/cuda/std/__cccl/ptx_isa.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/cuda/std/__cccl/ptx_isa.h b/libcudacxx/include/cuda/std/__cccl/ptx_isa.h index c351d402a31..e64504fb1d8 100644 --- a/libcudacxx/include/cuda/std/__cccl/ptx_isa.h +++ b/libcudacxx/include/cuda/std/__cccl/ptx_isa.h @@ -31,11 +31,14 @@ * https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes */ -// PTX ISA 8.5 is available from CUDA 12.5 +// PTX ISA 8.7 is available from CUDA 12.8 // The first define is for future major versions of CUDACC. // We make sure that these get the highest known PTX ISA version. #if _CCCL_CUDACC_AT_LEAST(13, 0) -# define __cccl_ptx_isa 850ULL +# define __cccl_ptx_isa 870ULL +// PTX ISA 8.7 is available from CUDA 12.8, driver r570 +#elif _CCCL_CUDACC_AT_LEAST(12, 8) +# define __cccl_ptx_isa 870ULL // PTX ISA 8.5 is available from CUDA 12.5, driver r555 #elif _CCCL_CUDACC_AT_LEAST(12, 5) # define __cccl_ptx_isa 850ULL From 2a03b6e78ef476b596eb34a42c0b38fb15489efe Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Tue, 28 Jan 2025 19:10:33 -0800 Subject: [PATCH 02/33] add missing visibility annotations to ustdex types that have data members (#3571) --- .../experimental/__async/sender/basic_sender.cuh | 6 +++--- .../cuda/experimental/__async/sender/conditional.cuh | 4 ++-- .../cuda/experimental/__async/sender/continue_on.cuh | 10 +++++----- .../include/cuda/experimental/__async/sender/env.cuh | 6 +++--- .../cuda/experimental/__async/sender/just.cuh | 4 ++-- .../cuda/experimental/__async/sender/just_from.cuh | 2 +- .../cuda/experimental/__async/sender/let_value.cuh | 4 ++-- .../cuda/experimental/__async/sender/read_env.cuh | 8 ++++---- .../cuda/experimental/__async/sender/sequence.cuh | 6 +++--- .../experimental/__async/sender/start_detached.cuh | 4 ++-- .../cuda/experimental/__async/sender/start_on.cuh | 6 +++--- .../cuda/experimental/__async/sender/sync_wait.cuh | 2 +- .../cuda/experimental/__async/sender/then.cuh | 4 ++-- .../experimental/__async/sender/thread_context.cuh | 2 +- .../cuda/experimental/__async/sender/when_all.cuh | 12 ++++++------ .../cuda/experimental/__async/sender/write_env.cuh | 6 +++--- 16 files changed, 43 insertions(+), 43 deletions(-) diff --git a/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh b/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh index ae8ad239d46..273daaa5e07 100644 --- a/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh +++ b/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh @@ -75,7 +75,7 @@ struct receiver_defaults }; template -struct basic_receiver +struct _CCCL_TYPE_VISIBILITY_DEFAULT basic_receiver { using receiver_concept = __async::receiver_t; using __rcvr_t = typename _Data::receiver_tag; @@ -212,10 +212,10 @@ _CUDAX_TRIVIAL_API auto __get_attrs(long, const _Data&, const _Sndrs&... 
__sndrs } template -struct basic_sender; +struct _CCCL_TYPE_VISIBILITY_DEFAULT basic_sender; template -struct basic_sender<_Data, _Sndr> +struct _CCCL_TYPE_VISIBILITY_DEFAULT basic_sender<_Data, _Sndr> { using sender_concept = __async::sender_t; using __tag_t = typename _Data::sender_tag; diff --git a/cudax/include/cuda/experimental/__async/sender/conditional.cuh b/cudax/include/cuda/experimental/__async/sender/conditional.cuh index 89aca853851..d913d315638 100644 --- a/cudax/include/cuda/experimental/__async/sender/conditional.cuh +++ b/cudax/include/cuda/experimental/__async/sender/conditional.cuh @@ -135,7 +135,7 @@ struct __cond_t }; template - struct __sndr_t; + struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t; template struct __closure @@ -174,7 +174,7 @@ struct __cond_t }; template -struct __cond_t::__sndr_t +struct _CCCL_TYPE_VISIBILITY_DEFAULT __cond_t::__sndr_t { __cond_t __tag_; __cond_t::__data<_Pred, _Then, _Else> __data_; diff --git a/cudax/include/cuda/experimental/__async/sender/continue_on.cuh b/cudax/include/cuda/experimental/__async/sender/continue_on.cuh index 8da87a443a3..b0ccd33fb9e 100644 --- a/cudax/include/cuda/experimental/__async/sender/continue_on.cuh +++ b/cudax/include/cuda/experimental/__async/sender/continue_on.cuh @@ -65,7 +65,7 @@ private: completion_signatures), set_error_t(::std::exception_ptr)>>; template - struct __rcvr_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_t { using receiver_concept = receiver_t; _Rcvr __rcvr_; @@ -127,7 +127,7 @@ private: }; template - struct __opstate_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t { _CUDAX_API friend auto get_env(const __opstate_t* __self) noexcept -> env_of_t<_Rcvr> { @@ -197,7 +197,7 @@ private: }; template - struct __sndr_t; + struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t; template struct __closure_t; @@ -211,7 +211,7 @@ public: }; template -struct continue_on_t::__closure_t +struct _CCCL_TYPE_VISIBILITY_DEFAULT continue_on_t::__closure_t { _Sch __sch; @@ -223,7 +223,7 @@ struct continue_on_t::__closure_t }; template -struct continue_on_t::__sndr_t +struct _CCCL_TYPE_VISIBILITY_DEFAULT continue_on_t::__sndr_t { using sender_concept = sender_t; _CCCL_NO_UNIQUE_ADDRESS continue_on_t __tag; diff --git a/cudax/include/cuda/experimental/__async/sender/env.cuh b/cudax/include/cuda/experimental/__async/sender/env.cuh index 3254181642b..eb0e232fead 100644 --- a/cudax/include/cuda/experimental/__async/sender/env.cuh +++ b/cudax/include/cuda/experimental/__async/sender/env.cuh @@ -63,7 +63,7 @@ template using __unwrap_reference_t = decltype(__unwrap_ref<_Ty>); template -struct prop +struct _CCCL_TYPE_VISIBILITY_DEFAULT prop { _CCCL_NO_UNIQUE_ADDRESS _Query __query; _CCCL_NO_UNIQUE_ADDRESS _Value __value; @@ -77,7 +77,7 @@ struct prop }; template -struct env +struct _CCCL_TYPE_VISIBILITY_DEFAULT env { __tuple<_Envs...> __envs_; @@ -108,7 +108,7 @@ struct env // partial specialization for two environments template -struct env<_Env0, _Env1> +struct _CCCL_TYPE_VISIBILITY_DEFAULT env<_Env0, _Env1> { _CCCL_NO_UNIQUE_ADDRESS _Env0 __env0_; _CCCL_NO_UNIQUE_ADDRESS _Env1 __env1_; diff --git a/cudax/include/cuda/experimental/__async/sender/just.cuh b/cudax/include/cuda/experimental/__async/sender/just.cuh index 3230b40ca93..3570de31624 100644 --- a/cudax/include/cuda/experimental/__async/sender/just.cuh +++ b/cudax/include/cuda/experimental/__async/sender/just.cuh @@ -61,7 +61,7 @@ private: using _SetTag = decltype(__detail::__set_tag<_Disposition>()); template - struct __opstate_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT 
__opstate_t { using operation_state_concept = operation_state_t; using completion_signatures = __async::completion_signatures<_SetTag(_Ts...)>; @@ -85,7 +85,7 @@ private: }; template - struct __sndr_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t { using sender_concept = sender_t; using completion_signatures = __async::completion_signatures<_SetTag(_Ts...)>; diff --git a/cudax/include/cuda/experimental/__async/sender/just_from.cuh b/cudax/include/cuda/experimental/__async/sender/just_from.cuh index 40df8b56825..f1e2ec87016 100644 --- a/cudax/include/cuda/experimental/__async/sender/just_from.cuh +++ b/cudax/include/cuda/experimental/__async/sender/just_from.cuh @@ -112,7 +112,7 @@ private: }; template - struct __sndr_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t { using sender_concept = sender_t; diff --git a/cudax/include/cuda/experimental/__async/sender/let_value.cuh b/cudax/include/cuda/experimental/__async/sender/let_value.cuh index 6742a1c1d6c..f681b0ad88b 100644 --- a/cudax/include/cuda/experimental/__async/sender/let_value.cuh +++ b/cudax/include/cuda/experimental/__async/sender/let_value.cuh @@ -146,7 +146,7 @@ private: /// @tparam _Rcvr The receiver connected to the `let_(value|error|stopped)` /// sender. template - struct __opstate_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t { _CUDAX_API friend env_of_t<_Rcvr> get_env(const __opstate_t* __self) noexcept { @@ -235,7 +235,7 @@ private: /// @tparam _Fn The function to be called when the predecessor sender /// completes. template - struct __sndr_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t { using sender_concept = sender_t; _CCCL_NO_UNIQUE_ADDRESS _LetTag __tag_; diff --git a/cudax/include/cuda/experimental/__async/sender/read_env.cuh b/cudax/include/cuda/experimental/__async/sender/read_env.cuh index 758e37c5714..f62c0d27bd5 100644 --- a/cudax/include/cuda/experimental/__async/sender/read_env.cuh +++ b/cudax/include/cuda/experimental/__async/sender/read_env.cuh @@ -61,7 +61,7 @@ private: }; template - struct __opstate_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t { using operation_state_concept = operation_state_t; using completion_signatures = // @@ -107,7 +107,7 @@ private: // This makes read_env a dependent sender: template - struct __opstate_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t { using operation_state_concept = operation_state_t; using completion_signatures = dependent_completions; @@ -116,7 +116,7 @@ private: }; template - struct __sndr_t; + struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t; public: /// @brief Returns a sender that, when connected to a receiver and started, @@ -127,7 +127,7 @@ public: }; template -struct read_env_t::__sndr_t +struct _CCCL_TYPE_VISIBILITY_DEFAULT read_env_t::__sndr_t { using sender_concept = sender_t; _CCCL_NO_UNIQUE_ADDRESS read_env_t __tag; diff --git a/cudax/include/cuda/experimental/__async/sender/sequence.cuh b/cudax/include/cuda/experimental/__async/sender/sequence.cuh index 3f8b17c7413..87c5f05c2b7 100644 --- a/cudax/include/cuda/experimental/__async/sender/sequence.cuh +++ b/cudax/include/cuda/experimental/__async/sender/sequence.cuh @@ -43,7 +43,7 @@ struct __seq }; template - struct __opstate + struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate { using operation_state_concept = operation_state_t; @@ -99,14 +99,14 @@ struct __seq }; template - struct __sndr_t; + struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t; template _CUDAX_API auto operator()(_Sndr1 __sndr1, _Sndr2 __sndr2) const -> __sndr_t<_Sndr1, _Sndr2>; }; template -struct __seq::__sndr_t +struct 
_CCCL_TYPE_VISIBILITY_DEFAULT __seq::__sndr_t { using sender_concept = sender_t; using __sndr1_t = _Sndr1; diff --git a/cudax/include/cuda/experimental/__async/sender/start_detached.cuh b/cudax/include/cuda/experimental/__async/sender/start_detached.cuh index df9d5c4e69e..7b4c620e5c1 100644 --- a/cudax/include/cuda/experimental/__async/sender/start_detached.cuh +++ b/cudax/include/cuda/experimental/__async/sender/start_detached.cuh @@ -40,7 +40,7 @@ private: struct __opstate_base_t : __immovable {}; - struct __rcvr_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_t { using receiver_concept = receiver_t; @@ -66,7 +66,7 @@ private: }; template - struct __opstate_t : __opstate_base_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t : __opstate_base_t { using operation_state_concept = operation_state_t; using completion_signatures = __async::completion_signatures_of_t<_Sndr, __rcvr_t>; diff --git a/cudax/include/cuda/experimental/__async/sender/start_on.cuh b/cudax/include/cuda/experimental/__async/sender/start_on.cuh index e2a04be7176..8656a57d489 100644 --- a/cudax/include/cuda/experimental/__async/sender/start_on.cuh +++ b/cudax/include/cuda/experimental/__async/sender/start_on.cuh @@ -52,7 +52,7 @@ private: #endif // !_CCCL_CUDA_COMPILER(NVCC) template - struct __opstate_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t { _CUDAX_API friend env_of_t<_Rcvr> get_env(const __opstate_t* __self) noexcept { @@ -103,7 +103,7 @@ private: }; template - struct __sndr_t; + struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t; public: template @@ -112,7 +112,7 @@ public: } start_on{}; template -struct start_on_t::__sndr_t +struct _CCCL_TYPE_VISIBILITY_DEFAULT start_on_t::__sndr_t { using sender_concept = sender_t; _CCCL_NO_UNIQUE_ADDRESS start_on_t __tag_; diff --git a/cudax/include/cuda/experimental/__async/sender/sync_wait.cuh b/cudax/include/cuda/experimental/__async/sender/sync_wait.cuh index 2f501b985df..ddf23694272 100644 --- a/cudax/include/cuda/experimental/__async/sender/sync_wait.cuh +++ b/cudax/include/cuda/experimental/__async/sender/sync_wait.cuh @@ -67,7 +67,7 @@ private: template struct __state_t { - struct __rcvr_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_t { using receiver_concept = receiver_t; __state_t* __state_; diff --git a/cudax/include/cuda/experimental/__async/sender/then.cuh b/cudax/include/cuda/experimental/__async/sender/then.cuh index b2ab494f8d0..d9f6ca9ec3e 100644 --- a/cudax/include/cuda/experimental/__async/sender/then.cuh +++ b/cudax/include/cuda/experimental/__async/sender/then.cuh @@ -126,7 +126,7 @@ private: __type_try_quote<__concat_completion_signatures>::__call>; template - struct __opstate_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t { _CUDAX_API friend env_of_t<_Rcvr> get_env(const __opstate_t* __self) noexcept { @@ -213,7 +213,7 @@ private: }; template - struct __sndr_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t { using sender_concept = sender_t; _CCCL_NO_UNIQUE_ADDRESS _UponTag __tag_; diff --git a/cudax/include/cuda/experimental/__async/sender/thread_context.cuh b/cudax/include/cuda/experimental/__async/sender/thread_context.cuh index 4a7c768ed25..c7b798a1734 100644 --- a/cudax/include/cuda/experimental/__async/sender/thread_context.cuh +++ b/cudax/include/cuda/experimental/__async/sender/thread_context.cuh @@ -33,7 +33,7 @@ namespace cuda::experimental::__async { -struct thread_context +struct _CCCL_TYPE_VISIBILITY_DEFAULT thread_context { thread_context() noexcept : __thrd_{[this] { diff --git 
a/cudax/include/cuda/experimental/__async/sender/when_all.cuh b/cudax/include/cuda/experimental/__async/sender/when_all.cuh index 1274a725e44..de5f089b979 100644 --- a/cudax/include/cuda/experimental/__async/sender/when_all.cuh +++ b/cudax/include/cuda/experimental/__async/sender/when_all.cuh @@ -53,10 +53,10 @@ template struct __env_t; template -struct __rcvr_t; +struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_t; template -struct __opstate_t; +struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t; using __tombstone = _ERROR<_WHERE(_IN_ALGORITHM, when_all_t), _WHAT(_SENDER_HAS_TOO_MANY_SUCCESS_COMPLETIONS)>; @@ -322,7 +322,7 @@ struct __env_t }; template -struct __rcvr_t +struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_t { using receiver_concept = receiver_t; using __state_t = __unzip<_StateZip>; @@ -524,7 +524,7 @@ struct __state_t<_Rcvr, _CvFn, __tupl<_CUDA_VSTD::index_sequence<_Idx...>, _Sndr /// The operation state for when_all template -struct __opstate_t<_Rcvr, _CvFn, __tupl<_CUDA_VSTD::index_sequence<_Idx...>, _Sndrs...>> +struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t<_Rcvr, _CvFn, __tupl<_CUDA_VSTD::index_sequence<_Idx...>, _Sndrs...>> { using operation_state_concept = operation_state_t; using __sndrs_t = _CUDA_VSTD::__type_call<_CvFn, __tuple<_Sndrs...>>; @@ -605,7 +605,7 @@ struct __opstate_t<_Rcvr, _CvFn, __tupl<_CUDA_VSTD::index_sequence<_Idx...>, _Sn }; template -struct __sndr_t; +struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t; } // namespace __when_all struct when_all_t @@ -616,7 +616,7 @@ struct when_all_t // The sender for when_all template -struct __when_all::__sndr_t +struct _CCCL_TYPE_VISIBILITY_DEFAULT __when_all::__sndr_t { using sender_concept = sender_t; using __sndrs_t = __tuple<_Sndrs...>; diff --git a/cudax/include/cuda/experimental/__async/sender/write_env.cuh b/cudax/include/cuda/experimental/__async/sender/write_env.cuh index 9cb61288671..1a9d6b913a8 100644 --- a/cudax/include/cuda/experimental/__async/sender/write_env.cuh +++ b/cudax/include/cuda/experimental/__async/sender/write_env.cuh @@ -41,7 +41,7 @@ private: #endif // !_CCCL_CUDA_COMPILER(NVCC) template - struct __opstate_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t { using operation_state_concept = operation_state_t; using completion_signatures = completion_signatures_of_t<_Sndr, __rcvr_with_env_t<_Rcvr, _Env>*>; @@ -63,7 +63,7 @@ private: }; template - struct __sndr_t; + struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t; public: /// @brief Wraps one sender in another that modifies the execution @@ -74,7 +74,7 @@ public: }; template -struct write_env_t::__sndr_t +struct _CCCL_TYPE_VISIBILITY_DEFAULT write_env_t::__sndr_t { using sender_concept = sender_t; _CCCL_NO_UNIQUE_ADDRESS write_env_t __tag_; From 74c17c662568c33bdd39532aa423c057c2f6d305 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= <158148890+caugonnet@users.noreply.github.com> Date: Wed, 29 Jan 2025 17:57:00 +0100 Subject: [PATCH 03/33] [STF] Document dot sections (#3506) * Start to document STF dot sections * fix formatting * Minor fixes in the doc * Add missing file * clang-format * Remove dot_push_section and dot_pop_section and also fix a bazillion warnings * Format * More Format * Add missing mv * misc. 
C++ fixes and clang-format * Update dot_section doc to reflect that we removed dot_push_section and dot_pop_section * - Fix documentation error - Use the dot_section doc example as a test - do not use assert directly * Review and a few more touches * Improvement for docs/cudax/stf.rst Co-authored-by: Bernhard Manfred Gruber --------- Co-authored-by: Andrei Alexandrescu Co-authored-by: Bernhard Manfred Gruber --- .../stf/linear_algebra/07-cholesky.cu | 8 +- cudax/examples/stf/linear_algebra/07-potri.cu | 8 +- cudax/examples/stf/linear_algebra/cg_csr.cu | 9 ++- .../__stf/internal/backend_ctx.cuh | 10 --- .../cuda/experimental/__stf/internal/dot.cuh | 33 ++++++-- cudax/include/cuda/experimental/stf.cuh | 28 +------ cudax/test/stf/CMakeLists.txt | 1 + .../jacobiCudaGraphs/jacobi.cu | 2 +- cudax/test/stf/dot/sections_2.cu | 49 ++++++++++++ cudax/test/stf/examples/07-cholesky-redux.cu | 46 ++++++------ .../test/stf/examples/07-cholesky-unified.cu | 48 ++++++------ cudax/test/stf/gnu/06-pdgemm.cpp | 20 ++--- cudax/test/stf/gnu/07-cholesky.cpp | 48 ++++++------ docs/cudax/stf.rst | 71 ++++++++++++++++++ docs/cudax/stf/images/dag-sections-0.dot | 5 ++ docs/cudax/stf/images/dag-sections-0.png | Bin 0 -> 7100 bytes docs/cudax/stf/images/dag-sections-1.dot | 14 ++++ docs/cudax/stf/images/dag-sections-1.png | Bin 0 -> 12081 bytes docs/cudax/stf/images/dag-sections-2.dot | 36 +++++++++ docs/cudax/stf/images/dag-sections-2.png | Bin 0 -> 42824 bytes docs/cudax/stf/images/dag-sections.dot | 69 +++++++++++++++++ docs/cudax/stf/images/dag-sections.png | Bin 0 -> 103821 bytes 22 files changed, 369 insertions(+), 136 deletions(-) create mode 100644 cudax/test/stf/dot/sections_2.cu create mode 100644 docs/cudax/stf/images/dag-sections-0.dot create mode 100644 docs/cudax/stf/images/dag-sections-0.png create mode 100644 docs/cudax/stf/images/dag-sections-1.dot create mode 100644 docs/cudax/stf/images/dag-sections-1.png create mode 100644 docs/cudax/stf/images/dag-sections-2.dot create mode 100644 docs/cudax/stf/images/dag-sections-2.png create mode 100644 docs/cudax/stf/images/dag-sections.dot create mode 100644 docs/cudax/stf/images/dag-sections.png diff --git a/cudax/examples/stf/linear_algebra/07-cholesky.cu b/cudax/examples/stf/linear_algebra/07-cholesky.cu index 5c7947fc5ac..41b6f053bb3 100644 --- a/cudax/examples/stf/linear_algebra/07-cholesky.cu +++ b/cudax/examples/stf/linear_algebra/07-cholesky.cu @@ -659,14 +659,14 @@ int main(int argc, char** argv) return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row); }; - ctx.dot_push_section("fillA"); + auto s = ctx.dot_section("fillA"); if (check_result) { Aref.fill(hilbert); } A.fill(hilbert); - ctx.dot_pop_section(); + s.end(); /* Right-hand side */ matrix B_potrs(N, 1, NB, 1, false, "B"); @@ -693,9 +693,9 @@ int main(int argc, char** argv) cudaEvent_t startEvent_pdpotrf, stopEvent_pdpotrf; float milliseconds_pdpotrf = 0; - // for (int row = 0; row < A.mt; row++) + // for (size_t row = 0; row < A.mt; row++) // { - // for (int col = 0; col <= row; col++) + // for (size_t col = 0; col <= row; col++) // { // cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); // NOOP(A, row, col); diff --git a/cudax/examples/stf/linear_algebra/07-potri.cu b/cudax/examples/stf/linear_algebra/07-potri.cu index e80fbffa663..37b9d93f78a 100644 --- a/cudax/examples/stf/linear_algebra/07-potri.cu +++ b/cudax/examples/stf/linear_algebra/07-potri.cu @@ -197,17 +197,17 @@ public: void print() { // print blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t 
colb = 0; colb < nt; colb++) { int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block ctx.host_launch(get_handle(rowb, colb).read())->*[=](auto sA) { - for (int lcol = 0; lcol < sA.extent(1); lcol++) + for (size_t lcol = 0; lcol < sA.extent(1); lcol++) { size_t col = lcol + colb * sA.extent(1); - for (int lrow = 0; lrow < sA.extent(0); lrow++) + for (size_t lrow = 0; lrow < sA.extent(0); lrow++) { size_t row = lrow + rowb * sA.extent(0); diff --git a/cudax/examples/stf/linear_algebra/cg_csr.cu b/cudax/examples/stf/linear_algebra/cg_csr.cu index ec0d3f1c9ff..bfa537d1995 100644 --- a/cudax/examples/stf/linear_algebra/cg_csr.cu +++ b/cudax/examples/stf/linear_algebra/cg_csr.cu @@ -51,7 +51,7 @@ public: static void copy_vector(const vector& from, vector& to) { to.ctx.parallel_for(to.handle.shape(), to.handle.write(), from.handle.read()).set_symbol("copy_vector") - ->*[] _CCCL_DEVICE(size_t i, slice dto, slice dfrom) { + ->*[] _CCCL_DEVICE(size_t i, slice dto, slice dfrom) { dto(i) = dfrom(i); }; } @@ -116,6 +116,13 @@ public: copy_scalar(a, *this); } + scalar& operator=(scalar&& a) + { + handle = mv(a.handle); + ctx = mv(a.ctx); + return *this; + } + scalar operator/(scalar const& rhs) const { // Submit a task that computes this/rhs diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index 4700ced4174..2ef462f103a 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -955,16 +955,6 @@ public: reserved::per_ctx_dot::set_parent_ctx(parent_ctx.get_dot(), get_dot()); } - void dot_push_section(::std::string symbol) const - { - reserved::dot::section::push(mv(symbol)); - } - - void dot_pop_section() const - { - reserved::dot::section::pop(); - } - auto dot_section(::std::string symbol) const { return reserved::dot::section::guard(mv(symbol)); diff --git a/cudax/include/cuda/experimental/__stf/internal/dot.cuh b/cudax/include/cuda/experimental/__stf/internal/dot.cuh index d711f32a6c4..cda74a1a899 100644 --- a/cudax/include/cuda/experimental/__stf/internal/dot.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/dot.cuh @@ -163,7 +163,7 @@ public: static int get_current_section_id(); template - void add_vertex(task_type t) + void add_vertex(const task_type& t) { // Do this work outside the critical section const auto remove_deps = getenv("CUDASTF_DOT_REMOVE_DATA_DEPS"); @@ -208,7 +208,7 @@ public: } template - void add_vertex_timing(task_type t, float time_ms, int device = -1) + void add_vertex_timing(const task_type& t, float time_ms, int device = -1) { ::std::lock_guard<::std::mutex> guard(mtx); @@ -286,7 +286,7 @@ public: ::std::shared_ptr parent; ::std::vector<::std::shared_ptr> children; - const ::std::string get_ctx_symbol() const + const ::std::string& get_ctx_symbol() const { return ctx_symbol; } @@ -352,7 +352,10 @@ public: // Constructor to initialize symbol and children section(::std::string sym) : symbol(mv(sym)) - {} + { + static_assert(::std::is_move_constructible_v
, "section must be move constructible"); + static_assert(::std::is_move_assignable_v
, "section must be move assignable"); + } class guard { @@ -362,10 +365,24 @@ public: section::push(mv(symbol)); } - ~guard() + void end() { + _CCCL_ASSERT(active, "Attempting to end the same section twice."); section::pop(); + active = false; + } + + ~guard() + { + if (active) + { + section::pop(); + } } + + private: + // Have we called end() ? + bool active = true; }; static auto& current() @@ -380,7 +397,7 @@ public: auto sec = ::std::make_shared
(mv(symbol)); int id = sec->get_id(); - int parent_id = current().size() == 0 ? 0 : current().top(); + int parent_id = current().empty() ? 0 : current().top(); sec->parent_id = parent_id; // Save the section in the map @@ -416,7 +433,7 @@ public: return 1 + int(id); } - const ::std::string get_symbol() const + const ::std::string& get_symbol() const { return symbol; } @@ -431,7 +448,7 @@ public: ::std::vector children_ids; private: - int depth; + int depth = ::std::numeric_limits::min(); ::std::string symbol; diff --git a/cudax/include/cuda/experimental/stf.cuh b/cudax/include/cuda/experimental/stf.cuh index 267a0dbd4b1..276c5725c5f 100644 --- a/cudax/include/cuda/experimental/stf.cuh +++ b/cudax/include/cuda/experimental/stf.cuh @@ -635,32 +635,6 @@ public: payload); } - /** - * @brief Start a new section in the DOT file identified by its symbol - */ - void dot_push_section(::std::string symbol) const - { - _CCCL_ASSERT(payload.index() != ::std::variant_npos, "Context is not initialized"); - ::std::visit( - [symbol = mv(symbol)](auto& self) { - self.dot_push_section(symbol); - }, - payload); - } - - /** - * @brief Ends current dot section - */ - void dot_pop_section() const - { - _CCCL_ASSERT(payload.index() != ::std::variant_npos, "Context is not initialized"); - ::std::visit( - [](auto& self) { - self.dot_pop_section(); - }, - payload); - } - /** * @brief RAII-style description of a new section in the DOT file identified by its symbol */ @@ -668,7 +642,7 @@ public: { _CCCL_ASSERT(payload.index() != ::std::variant_npos, "Context is not initialized"); return ::std::visit( - [symbol = mv(symbol)](auto& self) { + [&symbol](auto& self) { return self.dot_section(symbol); }, payload); diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt index 20527586cfe..d53b2e22140 100644 --- a/cudax/test/stf/CMakeLists.txt +++ b/cudax/test/stf/CMakeLists.txt @@ -8,6 +8,7 @@ set(stf_test_sources dot/basic.cu dot/graph_print_to_dot.cu dot/sections.cu + dot/sections_2.cu dot/with_events.cu error_checks/ctx_mismatch.cu error_checks/data_interface_mismatch.cu diff --git a/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu b/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu index 4b8e8be4600..1e0e7235080 100644 --- a/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu +++ b/cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu @@ -228,7 +228,7 @@ static __global__ void finalError(double* x, double* g_sum) double JacobiMethodGpuCudaGraphExecKernelSetParams( const float* A, const double* b, - const float conv_threshold, + float conv_threshold, const int max_iter, double* x, double* x_new, diff --git a/cudax/test/stf/dot/sections_2.cu b/cudax/test/stf/dot/sections_2.cu new file mode 100644 index 00000000000..d4cc74cf621 --- /dev/null +++ b/cudax/test/stf/dot/sections_2.cu @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * @brief This test makes sure we can generate a dot file with sections + */ + +#include + +using namespace cuda::experimental::stf; + +int main() +{ +// TODO (miscco): Make it work for windows +#if !_CCCL_COMPILER(MSVC) + context ctx; + auto lA = ctx.logical_token().set_symbol("A"); + auto lB = ctx.logical_token().set_symbol("B"); + auto lC = ctx.logical_token().set_symbol("C"); + + // Begin a top-level section named "foo" + auto s_foo = ctx.dot_section("foo"); + for (size_t i = 0; i < 2; i++) + { + // Section named "bar" using RAII + auto s_bar = ctx.dot_section("bar"); + ctx.task(lA.read(), lB.rw()).set_symbol("t1")->*[](cudaStream_t, auto, auto) {}; + for (size_t j = 0; j < 2; j++) + { + // Section named "baz" using RAII + auto s_bar = ctx.dot_section("baz"); + ctx.task(lA.read(), lC.rw()).set_symbol("t2")->*[](cudaStream_t, auto, auto) {}; + ctx.task(lB.read(), lC.read(), lA.rw()).set_symbol("t3")->*[](cudaStream_t, auto, auto, auto) {}; + // Implicit end of section "baz" + } + // Implicit end of section "bar" + } + s_foo.end(); // Explicit end of section "foo" + ctx.finalize(); +#endif // !_CCCL_COMPILER(MSVC) +} diff --git a/cudax/test/stf/examples/07-cholesky-redux.cu b/cudax/test/stf/examples/07-cholesky-redux.cu index 44421ce7458..26a4f9ac569 100644 --- a/cudax/test/stf/examples/07-cholesky-redux.cu +++ b/cudax/test/stf/examples/07-cholesky-redux.cu @@ -91,10 +91,10 @@ public: handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = handle(rowb, colb); @@ -171,10 +171,10 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block auto& h = handle(rowb, colb); @@ -367,9 +367,9 @@ void PDNRM2_HOST(matrix* A, double* result) reserved::dot::set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -454,17 +454,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -477,17 +477,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? 
alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -544,9 +544,9 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C @@ -563,7 +563,7 @@ void PDGEMM(cublasOperation_t transa, //================================ if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -574,7 +574,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -588,7 +588,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -599,7 +599,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -641,7 +641,7 @@ int main(int argc, char** argv) int ndevs; cuda_safe_call(cudaGetDeviceCount(&ndevs)); - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto lX = ctx.logical_data(shape_of>(1)); ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] __device__(size_t, auto) {}; @@ -690,9 +690,9 @@ int main(int argc, char** argv) cudaEvent_t startEvent_pdpotrf, stopEvent_pdpotrf; float milliseconds_pdpotrf = 0; - // for (int row = 0; row < A.mt; row++) + // for (size_t row = 0; row < A.mt; row++) // { - // for (int col = 0; col <= row; col++) + // for (size_t col = 0; col <= row; col++) // { // cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); // NOOP(A, row, col); diff --git a/cudax/test/stf/examples/07-cholesky-unified.cu b/cudax/test/stf/examples/07-cholesky-unified.cu index 480029e97c8..60b740b4f17 100644 --- a/cudax/test/stf/examples/07-cholesky-unified.cu +++ b/cudax/test/stf/examples/07-cholesky-unified.cu @@ -84,10 +84,10 @@ public: handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = handle(rowb, colb); @@ -162,17 +162,17 @@ public: void fill(Fun&& fun) { // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { int low_rowb = sym_matrix ? 
colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block ctx.host_launch(handle(rowb, colb).write())->*[=, self = this](auto sA) { - for (int lcol = 0; lcol < sA.extent(1); lcol++) + for (size_t lcol = 0; lcol < sA.extent(1); lcol++) { size_t col = lcol + colb * sA.extent(1); - for (int lrow = 0; lrow < sA.extent(0); lrow++) + for (size_t lrow = 0; lrow < sA.extent(0); lrow++) { size_t row = lrow + rowb * sA.extent(0); sA(lrow, lcol) = fun(*self, row, col); @@ -348,9 +348,9 @@ void PDNRM2_HOST(matrix* A, double* result) reserved::dot::set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -435,17 +435,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -458,17 +458,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -525,9 +525,9 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C @@ -544,7 +544,7 @@ void PDGEMM(cublasOperation_t transa, //================================ if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -555,7 +555,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -569,7 +569,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? 
beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -580,7 +580,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -650,9 +650,9 @@ int main(int argc, char** argv) PDNRM2_HOST(&Bref_potrs, &Bref_nrm2); } - // for (int row = 0; row < A.mt; row++) + // for (size_t row = 0; row < A.mt; row++) // { - // for (int col = 0; col <= row; col++) + // for (size_t col = 0; col <= row; col++) // { // cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); // NOOP(A, row, col); diff --git a/cudax/test/stf/gnu/06-pdgemm.cpp b/cudax/test/stf/gnu/06-pdgemm.cpp index 9e400b66e2e..6bb12f7633b 100644 --- a/cudax/test/stf/gnu/06-pdgemm.cpp +++ b/cudax/test/stf/gnu/06-pdgemm.cpp @@ -155,9 +155,9 @@ class matrix void fill(T (*func)(matrix*, int, int)) { // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - for (int rowb = 0; rowb < mt; rowb++) + for (size_t rowb = 0; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); #ifdef TILED @@ -167,9 +167,9 @@ class matrix int ld = m; #endif - for (int lrow = 0; lrow < mb; lrow++) + for (size_t lrow = 0; lrow < mb; lrow++) { - for (int lcol = 0; lcol < nb; lcol++) + for (size_t lcol = 0; lcol < nb; lcol++) { size_t row = lrow + rowb * mb; size_t col = lcol + colb * nb; @@ -257,9 +257,9 @@ void PDGEMM(Ctx& ctx, double beta, matrix& C) { - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C @@ -277,7 +277,7 @@ void PDGEMM(Ctx& ctx, if (transb == CUBLAS_OP_N) { assert(A.nt == B.mt); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -288,7 +288,7 @@ void PDGEMM(Ctx& ctx, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -302,7 +302,7 @@ void PDGEMM(Ctx& ctx, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -313,7 +313,7 @@ void PDGEMM(Ctx& ctx, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); diff --git a/cudax/test/stf/gnu/07-cholesky.cpp b/cudax/test/stf/gnu/07-cholesky.cpp index 1e40da9a3c5..178adcc06c8 100644 --- a/cudax/test/stf/gnu/07-cholesky.cpp +++ b/cudax/test/stf/gnu/07-cholesky.cpp @@ -90,10 +90,10 @@ class matrix handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { int low_rowb = sym_matrix ? 
colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = handle(rowb, colb); @@ -168,17 +168,17 @@ class matrix void fill(Fun&& fun) { // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block ctx.host_launch(handle(rowb, colb).write())->*[this, fun, rowb, colb](auto sA) { - for (int lcol = 0; lcol < sA.extent(1); lcol++) + for (size_t lcol = 0; lcol < sA.extent(1); lcol++) { size_t col = lcol + colb * sA.extent(1); - for (int lrow = 0; lrow < sA.extent(0); lrow++) + for (size_t lrow = 0; lrow < sA.extent(0); lrow++) { size_t row = lrow + rowb * sA.extent(0); sA(lrow, lcol) = fun(*this, row, col); @@ -351,9 +351,9 @@ void PDNRM2_HOST(matrix* A, double* result) reserved::dot::set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -437,17 +437,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -460,17 +460,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -527,9 +527,9 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C @@ -546,7 +546,7 @@ void PDGEMM(cublasOperation_t transa, //================================ if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -557,7 +557,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? 
beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -571,7 +571,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -582,7 +582,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -662,9 +662,9 @@ int main(int argc, char** argv) cudaEvent_t startEvent_pdpotrf, stopEvent_pdpotrf; float milliseconds_pdpotrf = 0; - // for (int row = 0; row < A.mt; row++) + // for (size_t row = 0; row < A.mt; row++) // { - // for (int col = 0; col <= row; col++) + // for (size_t col = 0; col <= row; col++) // { // cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col))); // NOOP(A, row, col); diff --git a/docs/cudax/stf.rst b/docs/cudax/stf.rst index 63c8d6363d0..65a4b1ed105 100644 --- a/docs/cudax/stf.rst +++ b/docs/cudax/stf.rst @@ -1911,6 +1911,9 @@ helps to better understand the application, and can be helpful to optimize the algorithms as it sometimes allow to identify inefficient patterns. +Generating visualizations of task graphs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Let us consider the ``examples/01-axpy.cu`` example which we compile as usual with ``make build/examples/01-axpy``. @@ -1999,6 +2002,74 @@ It is also possible to include timing information in this graph by setting the color the graph nodes according to their relative duration, and the measured duration will be included in task labels. +Condensed and structured graphs visualization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Realistic workloads are typically made of thousands or millions of tasks which +cannot be easily visualized using graphviz (dot). To simplify the generated +graphs we can further +annotate the application using dot sections. +Dot sections can also be nested to better structure the visualization. + +This is achieved by creating `dot_section` objects in the application. `ctx.dot_section` returns +an object whose lifetime defines a dot section valid until it is destroyed, or +when calling the `end()` method on this object. The following example +illustrates how to add nested sections: + +.. 
code:: c++ + + context ctx; + auto lA = ctx.logical_token().set_symbol("A"); + auto lB = ctx.logical_token().set_symbol("B"); + auto lC = ctx.logical_token().set_symbol("C"); + + // Begin a top-level section named "foo" + auto s_foo = ctx.dot_section("foo"); + for (size_t i = 0; i < 2; i++) + { + // Section named "bar" using RAII + auto s_bar = ctx.dot_section("bar"); + ctx.task(lA.read(), lB.rw()).set_symbol("t1")->*[](cudaStream_t, auto, auto) {}; + for (size_t j = 0; j < 2; j++) { + // Section named "baz" using RAII + auto s_bar = ctx.dot_section("baz"); + ctx.task(lA.read(), lC.rw()).set_symbol("t2")->*[](cudaStream_t, auto, auto) {}; + ctx.task(lB.read(), lC.read(), lA.rw()).set_symbol("t3")->*[](cudaStream_t, auto, auto, auto) {}; + // Implicit end of section "baz" + } + // Implicit end of section "bar" + } + s_foo.end(); // Explicit end of section "foo" + ctx.finalize(); + +When running this with the `CUDASTF_DOT_FILE` environment variable for example +set to `dag.dot`, we observe that the graph produced by `dot -Tpdf dag.dot -o +dag.pdf` depicts these sections as dashed boxes. + +.. image:: stf/images/dag-sections.png + +Adding sections also makes it possible to define a maximum depth for the +generated graphs by setting the `CUDASTF_DOT_MAX_DEPTH` environment variable. +When it is undefined, CUDASTF will display all tasks. Otherwise, if +`CUDASTF_DOT_MAX_DEPTH` is an integer value of `i` any sections and tasks which +nesting level is deeper than `i` will be collapsed. + +When setting `CUDASTF_DOT_MAX_DEPTH=2`, the previous graph becomes: + +.. image:: stf/images/dag-sections-2.png + +When setting `CUDASTF_DOT_MAX_DEPTH=1`, one additional level is collapsed: + +.. image:: stf/images/dag-sections-1.png + +With `CUDASTF_DOT_MAX_DEPTH=0`, only the top-most tasks and sections are displayed: + +.. image:: stf/images/dag-sections-0.png + +Note that `CUDASTF_DOT_MAX_DEPTH` and `CUDASTF_DOT_TIMING` can be used in +combination, and that the duration of a section corresponds to the duration of +all tasks in this sections. 
+ Kernel tuning with ncu ^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/cudax/stf/images/dag-sections-0.dot b/docs/cudax/stf/images/dag-sections-0.dot new file mode 100644 index 00000000000..c447276ec43 --- /dev/null +++ b/docs/cudax/stf/images/dag-sections-0.dot @@ -0,0 +1,5 @@ +digraph { +"NODE_23" [style="filled" fillcolor="red" label="task fence"] +"NODE_3" -> "NODE_23" +"NODE_3" [style="filled" fillcolor="white" label="foo"] +} diff --git a/docs/cudax/stf/images/dag-sections-0.png b/docs/cudax/stf/images/dag-sections-0.png new file mode 100644 index 0000000000000000000000000000000000000000..bc15d5355cdefdd56af1bcbf513e78f9b852d48c GIT binary patch literal 7100 zcmc&(^;cBg*9Rm7=|;L69vTE3x*LW@q-*GqPEkUoTe>7hC5IY731KKvdSGazV<_qG z^_TZwc-PF_yVhNE&%I}#y+5%}qJf?|5y4Xe3=9k+O$`+z;9Lp(3LfJE$46FHc)$tQ zURzxSQjDF2Q8iVaH<-=Y9<`58#QK;+tm*Rgoy8t3eLrL??+A4@ekyk}KcPxS_N~8P+N36w|38tP z4{ll%eRlc3`hCpLsYzWE9#c_KnV6Uun3=u(y;ITNo_BE(6dN1+n23mzhbLis+ZC7S zX@76bqw{PApWLc0EWPTC3DCnw;hx)chaVq&Z~yx070GF;s^aKXpXQ%7AP+b=I8w8- z)tsHVV2gfF90uR5wj#ay>>K_b6JpT}blu$JMLpas&$oKghlGTLU++|lOG#1C(P^rw z;pXM#Ra94JjRTKB9VeyXiL@!L0P3&i0BGA)zH_Uy*YcYG)?4X@CXVKc z#O?1x8k?HnUeqw;n3-*a>Wddlov)6ZXFuDRn3|@irG1OVd1BrjF2k<=;cV9hmlixk zl+d@DDzzML%%z*p5lzVa+`1?nttsQP65Y1s)#JXiy`9?CB{w`goVKyyXh^B*JpPe4 z>T1NcqP)ECizT7bCXtiY+dkI48ED38-`nH?`!pHbfJ@^3-#cYR~hs_G7F@mp2?)}`b*7q{nu)0N?yL-MeMAx!f4_T+ zR%mF)#@5!Z;IQpklrc3mHKN|4NyU)TqTR1@b!@KCxLR)*eu?6b7O-wJ(9+gE&))F3 zhVjlqt+-cLSN-;&Fhr4j*je^(?pE(*A$GNQsdIDYArE(#=h^Gp*VVkpg$2he^kx}P zlHC2@^)(;qv+ND3{QUfl(zi_Hs)idsK=-~;aI)Q~s(L6itH!umDjvPTsi9r-|%)6>&|7x3@CzxJroEf_q2Esx_dhet1Y z&03{PZ8NX)E_q2M(+F>k=84^&e~EA{{oKE?C=dYhe?LAtk(1XK@WI4bo%{U$4S zx#({XLvF5oiwo7Bj6Xoal51JU?Zm!pf4~=n~=3cZVov) z>+Y*9dU)koRcu`1@b(7Ic`<=m882I9Al zj*dfRs>x^Bzs#_ZnMd%;BAz5RJYt69P}F&Qibj%z^Lz3hj?jaJlR4^5;WKn!17VUJ~h_hI%_xQsmH&tzcf2wY6H_ zyl}3spMz=cZ~uUR>Kg(A7m?bz66Ddt3*3|=VA-5dQo_PK#dOd9Qn%e@f(cl;)$%`F8*6l;^8jRrNWZBt&FjMu8C&G)sH14z{0FTw$S_*rokK(AUPr@~F|S z4;LuEH*|(kg(kR;=QcJrL<$f{eH5tePOLRDmLyiIL%v8a|_fj?X0xxC~FFJSn3V z@uckwI~2CVATqVMxPRVtr>KxG%Eg5nyoW9of;dU3I7==ry2r z+l%>WHf;tuH8q9t-^=LB(+fOIH^jI*@+iXh0)Mne7PHb5gnTgl!Ku$^Ns`uChnx6u z5VS-W=f~&<*9C*A?Ck9Bzi4*Fqo4~YQOMkXS7Y9h>*b4I{4V81%^|#}8&~=Sl{1q< z?_Dm3`A596Nj+jmuZq21Ezjo86x3U`XmRP}uEc%3Q+kPtNKO#2=`eAgC^|R~N1|-_ z$Oy1(zKdEjEzd@Zysz683pqatbG+4UAa{pp`UIpeKHoFEy5n}Kgj+?2HtxPy@E|8AXKQI`3Dp*%92giVtE~+`{)QK-y(1Vi&x!pt_{3g-`U$<1 z*OXOKY4F|oY+_1^%S^om@COM@55)1Y54Di>P`-r6_V)Hqs}HQMUS3<%X0}e7h^#q? zBz*HMKU#vb*H~X(72p7m+0WBfNvwaDjw02KjVWYgWPoAlL#0@*5e_NxTmbK+BM@5f z%55z}!!#8Ym7(AIAfTMO$>-w2F@Yr>Mj+O<3cR-f98odE^89MC450tZ-KiRJadDsH zm2bTV#Z!+L6}t;nR8>!^L3fFavO!-%&)wmh+u!x0u8q;Br}NEj#wGMd%14b0R$YJB zk^y9`w}R<;^F}(I?@j}=hb*=kdh?>#d~SHd)@XYE#+yS;=fB#;p^R>o{?MEZ(AWZz zklL}W3g}!y_fzO6l=0k!GPNNOvbl@HjVM*q{uG7jdCQ=xqVgIuK)0(Tz?k*s4N6bx z=>bd!ySuwP-rrsU$meu-eGZ^B&fV`!Ha29dXGU9_w5Gm(PTyOSJ;5LR`wK1F=@}WE zGBOrHM<^(a!FDnV045_#OTORL`ZUP{Y$@^=xl_~AHg~$X6=h{gOyS3h0Ay`V6bnyJ zPlsxwIq#5}Y?e132MaCa%*?`ZcqG{p9y1yq9(Ab3&xwPBY9SZ%E}UFkYQSSQ0Vtob zN(XH}RPAqkkADJOVrOM#6?GgU4Oxw+A++){1%rphCgj+CeSOL4=y>(z&ojj(Bz6Qx zY!{`0dv32z|J?&*Iyp5p7wGRd5=VeiIwo%Y+pQZtW`!&|0&1{va5PO!GH=m2J(>CW zPi+Esdf&Ss^mcc59mnz|()03oqX`+GnE>g@AuO!FfQ*p1y)d%0vT_0#8r{cyZvdn! 
[binary PNG data omitted]

diff --git a/docs/cudax/stf/images/dag-sections-1.dot b/docs/cudax/stf/images/dag-sections-1.dot
new file mode 100644
index 00000000000..eddf1e29d9c
--- /dev/null
+++ b/docs/cudax/stf/images/dag-sections-1.dot
@@ -0,0 +1,14 @@
+digraph {
+"NODE_23" [style="filled" fillcolor="red" label="task fence"]
+subgraph cluster_section_1 {
+ color=black;
+ style=dashed
+ label="foo"
+ "NODE_13"
+ "NODE_3"
+} // end subgraph cluster_section_1
+ "NODE_13" -> "NODE_23"
+"NODE_3" -> "NODE_13"
+"NODE_13" [style="filled" fillcolor="white" label="bar"]
+"NODE_3" [style="filled" fillcolor="white" label="bar"]
+}
diff --git a/docs/cudax/stf/images/dag-sections-1.png b/docs/cudax/stf/images/dag-sections-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..6f5ca311117f7e31f2e3fd8c79b676b38108f4bc
GIT binary patch
literal 12081
[binary PNG data omitted]

diff --git a/docs/cudax/stf/images/dag-sections-2.dot b/docs/cudax/stf/images/dag-sections-2.dot
new file mode 100644
index 00000000000..8c1da4d110c
--- /dev/null
+++ b/docs/cudax/stf/images/dag-sections-2.dot
@@ -0,0 +1,36 @@
+digraph {
+"NODE_23" [style="filled" fillcolor="red" label="task fence"]
+subgraph cluster_section_1 {
+ subgraph cluster_section_2 {
+ color=black;
+ style=dashed
+ label="bar"
+ "NODE_9"
+ "NODE_5"
+ "NODE_3"
+} // end subgraph cluster_section_2
+ subgraph cluster_section_5 {
+ color=black;
+ style=dashed
+ label="bar"
+ "NODE_19"
+ "NODE_15"
+ "NODE_13"
+} // end subgraph cluster_section_5
+ color=black;
+ style=dashed
+ label="foo"
+} // end subgraph cluster_section_1
+ "NODE_19" -> "NODE_23"
+"NODE_3" -> "NODE_5"
+"NODE_5" -> "NODE_9"
+"NODE_15" -> "NODE_19"
+"NODE_9" -> "NODE_13"
+"NODE_13" -> "NODE_15"
+"NODE_19" [style="filled" fillcolor="white" label="baz"]
+"NODE_15" [style="filled" fillcolor="white" label="baz"]
+"NODE_13" [style="filled" fillcolor="white" label="t1\nA(read)(0) \nB(rw)(0) "]
+"NODE_9" [style="filled" fillcolor="white" label="baz"]
+"NODE_5" [style="filled" fillcolor="white" label="baz"]
+"NODE_3" [style="filled" fillcolor="white" label="t1\nA(read)(0) \nB(rw)(0) "]
+}
diff --git a/docs/cudax/stf/images/dag-sections-2.png b/docs/cudax/stf/images/dag-sections-2.png
new file mode 100644
index 0000000000000000000000000000000000000000..aed7fb4912a056771f7ce1aac0a14972c739f60a
GIT binary patch
literal 42824
[binary PNG data omitted]
diff --git a/docs/cudax/stf/images/dag-sections.dot b/docs/cudax/stf/images/dag-sections.dot
new file mode 100644
index 00000000000..bbf13630ee9
--- /dev/null
+++ b/docs/cudax/stf/images/dag-sections.dot
@@ -0,0 +1,69 @@
+digraph {
+"NODE_23" [style="filled" fillcolor="red" label="task fence"]
+subgraph cluster_section_1 {
+ subgraph cluster_section_2 {
+ subgraph cluster_section_3 {
+ color=black;
+ style=dashed
+ label="baz"
+ "NODE_7"
+ "NODE_5"
+} // end subgraph cluster_section_3
+ subgraph cluster_section_4 {
+ color=black;
+ style=dashed
+ label="baz"
+ "NODE_11"
+ "NODE_9"
+} // end subgraph cluster_section_4
+ color=black;
+ style=dashed
+ label="bar"
+ "NODE_3"
+} // end subgraph cluster_section_2
+ subgraph cluster_section_5 {
+ subgraph cluster_section_6 {
+ color=black;
+ style=dashed
+ label="baz"
+ "NODE_17"
+ "NODE_15"
+} // end subgraph cluster_section_6
+ subgraph cluster_section_7 {
+ color=black;
+ style=dashed
+ label="baz"
+ "NODE_21"
+ "NODE_19"
+} // end subgraph cluster_section_7
+ color=black;
+ style=dashed
+ label="bar"
+ "NODE_13"
+} // end subgraph cluster_section_5
+ color=black;
+ style=dashed
+ label="foo"
+} // end subgraph cluster_section_1
+ "NODE_15" -> "NODE_17"
+"NODE_9" -> "NODE_11"
+"NODE_11" -> "NODE_15"
+"NODE_13" -> "NODE_17"
+"NODE_19" -> "NODE_21"
+"NODE_11" -> "NODE_13"
+"NODE_7" -> "NODE_9"
+"NODE_5" -> "NODE_7"
+"NODE_17" -> "NODE_19"
+"NODE_3" -> "NODE_7"
+"NODE_21" -> "NODE_23"
+"NODE_21" [style="filled" fillcolor="white" label="t3\nA(rw)(0) \nB(read)(0) \nC(read)(0) "]
+"NODE_19" [style="filled" fillcolor="white" label="t2\nA(read)(0) \nC(rw)(0) "]
+"NODE_17" [style="filled" fillcolor="white" label="t3\nA(rw)(0) \nB(read)(0) \nC(read)(0) "]
+"NODE_15" [style="filled" fillcolor="white" label="t2\nA(read)(0) \nC(rw)(0) "]
+"NODE_13" [style="filled" fillcolor="white" label="t1\nA(read)(0) \nB(rw)(0) "]
+"NODE_11" [style="filled" fillcolor="white" label="t3\nA(rw)(0) \nB(read)(0) \nC(read)(0) "]
+"NODE_9" [style="filled" fillcolor="white" label="t2\nA(read)(0) \nC(rw)(0) "]
+"NODE_7" [style="filled" fillcolor="white" label="t3\nA(rw)(0) \nB(read)(0) \nC(read)(0) "]
+"NODE_5" [style="filled" fillcolor="white" label="t2\nA(read)(0) \nC(rw)(0) "]
+"NODE_3" [style="filled" fillcolor="white" label="t1\nA(read)(0) \nB(rw)(0) "]
+}
diff --git a/docs/cudax/stf/images/dag-sections.png b/docs/cudax/stf/images/dag-sections.png
new file mode 100644
index 0000000000000000000000000000000000000000..77a5ca2030d059407cdf12721dde35990fe9365c
GIT binary patch
literal 103821
[binary PNG data omitted]
ze|o*%+bekO;IbX~@a+P0__UTUG9IM8p#JKmKH=`z5lK0K0Rb{g9=;g=V)c}V3eM<1 zw-82FBmV=Hc>Y(E^u7kZBN&%}=um&jkeq_4TCfHslfS z@Zg)+H~hZ8Ff})S=x=Xt&w{O@qVl?8)ZE>DC7GNJOhcATV4@u4q2hKJ({ zqc^koQe3|#q@{IVIH-MomQ!CzNl8mfOJ84KO-+qY>g;f{)%ray8TZFepBz>PTF1uf zWa61LcNwX27mpE^l(N)Lz|OHG4D&^D=a7&7#LWaZo2YDLEYHg zyor4eahG=Y#;!D%;MM3b0cJpOaBzQr|MNE!HEu_5TwH4F>gtk{lN^K0JI@i z4rvNJUgy6QTwGe`moc)_CL;^MftxKtA1_BJ-F6SdEVSPSSn=)EONE4H?_e0+T5 z<>je+AFDCz*9V3FTfxDPA3t9Goj>?@wi$95=ZkONN+)^z?3|n%*RKcN{pfnIb}*%ot&Jgwhz_6yY*n#nYSR*^v*)8?~>Z<`uci&0s^TdMVhsZ=tl~}r3Thk zR$+MX(zU-jTDiJ%4+$v+_bp$YZ8r_($PNDfoi(@$2gz%{@~b`KL4J6lQlti^(Clsj zYu0ii6f6JMm;BX*)e8#zU?#mfwpT6a*v(*d@eEq{ckjNN=Nz)>{16fn0?#d%$RVev zm)v8lkjTNv$QZVkMvQRD*1SVgN-ScNeDD{!OwGAZoCWR>5?)-MZG#=N5nG)f?|@G{ z{1CL3wxDfev$Hk%^{c~rARaNdy{V?=M@70U?z~cp(mOHkOKO7u76Gbc-}CIci+hwY zad8}Ohn6ZTgMcvHTwURc`gQi5t*v66Zz^e%A-$xgPS3YTWM*Wrv9a;;@f8;paad2C z@61aSh(u#WY7|qYx}aT{=rB`JHIIx`g8}U9><9@7adB}$?5U`%%;IJe^L_a-G*m`Z zRMglwr@HzG`~{GKda*W*r=vc*kbYHi)S?>wXRGGC35blDXwB zVz$W+PEOl@%BX7XIMVa-Oos~8hKGm2TigyeT>j4Z9)` z=w0T^&FX0e1_pRS-HN-x<-YY%Pt<=hDn>$}%Fkc@?spUI8p_b^ z$F=qKyk1^j9Eny|R%8SORPz<%fBt}rb_4BJJFDE`Tmy`_3idCUp zw|w;~qqvwczI(FLOvT8EW@t7qa-`HiA&K*L^XJvUT!=YJWVoFarFjqM>&;)2S%@Xa zT26i&9sM*$RLM(jy*iNXck|Zp&`^WZ_EZA91@*@A0dh2dah%ct;4IWQE2x6hxPoei2+&NjbmazBY$ zdpmZ0SK70KwzTEQq3ZN)ekqCK;$W@roF5`qcmgxzf1n*`BY-hmU} zyFSV!BvAwBA!N$hS{iEVNY0ne&NVqXInmK7IwgBPm3sBgWqMzcB#)fCG`$8GvG0#X zwbu*P26w~fNjNOOu~=T7pFkqW`uW(T%i#Ph)J+2?q3vjy@Tz$epl1r78}Yq+5SUcR zWK?1}ORqm5H<+q+?W%70X+NYX8y@#kBUcLLCIMESHQlcpAyFD~(SzLdc<8Xl&KE3) zo8u#gqYJ-se}-N?|33C-xGWbJMU^b{+fJ~6|WJd6nt5c%I`tU=UQ1;ciMDyDI={K zlYi=zLmswxK9X2xrtj)1&3rUnY8cc5y`3z7HGOp{F;^L>QN@;| zq4vQJo9D!!G;b$rroYrtwq>P^R_%Lgb)>RBdtggbck4a;$?HTAV(^Hc8lu-8#SN+I z>+^e_A6HgZu8)+CkB`f2i^eEB@q9ezix0bwp(-K#lsmc6q>udd>v^$IB8=1? zUgx(5YeW1Vr;@7-wut$+u_qkIsh+37orVwsV<>`XWa1Rb!q*nm_!hUPV+r}7`b!RSs#MTJ$WP7S1KRkQWR2`+dz`QhbHamAR=H$tjl302TCIBnLMOqrA`q?blu|4z; zuYQD7#gCtCP0tT1g-Qc0U3WxRK zi3wc=g}#`W7&xMCegs@-?xH7kVyPA9*kIvG7biBq>!JSwMJYAS-3t=}0s_=)*E~-a zy-|I&Kj#Gqowt-boFEPZd^sBBN zG_CTxAJ43=o=iGT7wb19tZ5=nI7#JAO-%tV>G0QlE-ULF9*$eV8|tCI9Q0G6eZV?) 
zzD;S3MLYCe66dkY(8vFB4q1)8Oxl$}6hV;O=w3IuE}n&JH&0BQ|4J8u?ciczSxjQD zuC8W^hefU65%$pY3cM;?Z!7o~q*5U;{hC5nZ?MAR8>3IrtLWq}%aB$Z8sy~Uq>>`% zE-TX0(@h}n+b#r^lrRMkc45iN>ETcvyYu@*{+;61Ie7WC!luA4qs1SC!1(9K+2!R& ziDB|;Y^6a1<4v;K=L=`Q>}1>umB_MWJ2*?VBU((zsnxW#&0oEGYB9`!`8tw`_v)`E zj<(xTrq<$v=;1?+>XTLZ&1Ku;9qB_}O@ffnP#*i08My2wLB6>@M#_T}Q&H;`<@h&M zygmvJzXk?E>XnNxD+)DAAzeJCqf2ACVMgzDeDofLHbFK}`<*`d_#2NFfhPU?v*RUO zJiZr`(*e%W6_4xSsD7>PpB-(>9556lXgr7rTV``-&Hn8W7pbPNW+8T^^*+tiE6CUP znv6`hf#-?Q$?jszb%bFAMGy+=4Zxjxb@o!yx!(24LQgviIUZg$mH=B=Ww-$OvZ}I@ zh(S~E4MH&G<-NnbJ^=|)QGi^Mt9K$bo=Dzs84{-+83lL%RHyN~9p`$FvqQ8SHvpox z+43E~`uUL+La4H9CF!oTkraFBSa`x&9VCF@`Qsw33OX*HG`zrq9VQx;&<5;x#?$6j5 zU`q}j>|N>Ds)wb-1pf%ip+lDfDRqiuRQG0{x0u3p_bXpw+x^N5mw5%OzXn%( zUeWO>9?a|GEL|eJH9&sclWTuIXP3z1xjO%ao~`Y zF&(~cs5WU(435%S*K5?em(C@y4>)aUaVRdhPy+WLg5jYyOX7PR5)N4aeG`q2ks3I@ z^#uxt7bY}r^56eMO$Vdo)p|Ji3#SJLw)vpmq~>PkdScB~^>k}6mx_$FNP~KKdiBoL z-|2e!q^<7xE;@91QlU%FyBs-uo62E{X(zY*NZH@>`dL_Ij|k?eIgK0{_xCJG#Jam= zKQ)q5xG_1ft!?(KLh04Yuk>doy_T~t!0>$P3yQ-WiU|AsKDhHi_8zv+?1Q=`Ha51Z zM8b;1BZP~*?eB?2bt${Z@!Zdynhf&Uf*@mR6f6jIGc3eW<53$qdZQqzZnpDy%Ps35BBJq}L&s z6!wiwfb9z@)h;dV-R)QXdns#dZ`Wd`WY4Ysl&y>%VeAm#KJa*d@PMZ)=#<>J%FK)B z4f;{fYE!1PK+`Y5ydRH7te56w$1fYqEw77Je_78A2$_Gi^*8Zj*h2udpFVvOPLGTk zg{XVeg@&j6((4LpkY~@H(W(r4um=AKZ7$?_9%}!JrD|Daq5+v9R-S?R@pA{9btmm1xDs4IWvtSkVVq-4~}je{&y2=m)Q~<%sVy zmwx_lGfgtGj%&tu=T^Fz%Y(f)#Uvd<=#=mV7;@7+UmhlE>Ta0cgH zNRUE3KBZ5-S>Dj;M>Lf|#dbV><)6(Y$ze~HmX^@di5edyYzknxpt;b{D(SO}&z|+g z#l-;u!66W?oALA6BXZoG1D143e>~$A4yzd4ClP#}ON;R_-=-Ju2BFKU$W)~F`Q2~U z*i2YAE$B3?dQd}IcD#0NexQ3jF3CB6v20#*=BLXR4=UCmP4`r z9~2_X&T?hTJFKhf%dVTvkG(E71zL2YTZgMI-yBSRifhH8ew>qEEY|+_NBy<>QN?xB zG#SOsQM%0R>;Swcj5N!AvK{mt^sKgfAq-JNS^&Geyso|^Z3FIJTPwfh6+E8#o&TJv zasEKy5nj&oH>cN8iE`#v*0mz-SN`PjI} zq9Z4yQ@Jy=rSJSpS2kfx8iC6fg_OS4 zbR#63&8Juxot~43X2a>=vL~x8tL-PbkE)sr%;Y%Ly{Tq12OkNP(!L~Rx+&tXNMTCB z`M$GYk0bxdlZ9E)lYqzE2Foog$#JcFAO1QlSx^cG9XlWWkuIg=u)W z+duqQ3-DI^j7iVFtj##{o>pR)puUj)&~JNvhy_(fv7OKdEH78i5HZv<85$bu?lwAG z=*T5$@9U3i8Cvbux!?3Ngvs4z>Ep5Z9c{N}T>JNQjE}AQ6JxD@3B3z6Q)tjs+Lq^1 z>YcLIlH7Oi%X(RHSu@yw^)q=w8ATq?e`P3DL=)X3C3Uc|fi{fpN>g|5O5@;bHb+K` z#>0oEH^gsLwh0^AldM^a67bgVrM2baW&HelBolc!W4);v)+_#Lch{D*ot=$M@)OQ3 z1j_dl_2zhC;?etDXj(_9LjwbIJI-?06nZwlcY++J8d2BJb^qWts<^UYo!!Ep04T)zM;{LU^&Po;mh$dj_QzeErII$PWkf>za-h~kF{GtE<0t?EzF)S zcGXTMMB>kq?p~Y(Q(g{Ug$?Ic6wz*nyQy@Qo*tjC4ND#7obGe?ZQ@;5BHbvF8a&{- zS`1DK6lrzA5I7il;j}ZFnsk(3Pq(@6JGGYd2J5Qu>s|TDQM#zeNN+szZVox%GKBPl zKO|}xesD;}!NJKMTvby$NR_8fU0DqOZgBbeCC=DohWH`+vufO;o$rXrpI0|hy-chW zauT{Plltb5F}jOa*gdIicttF7^H=nC(-BTLyARD}Z^#UOK2w}FxN=dtxS7gwbUJ)r zvr>q2;`Pa&18n-sKWR8-2I)s{AWtg*;CJ`Ktd zc(mKymv?-A$0IOcx=LZlq=aPc?r}TMySVU8tBjtP7WOOr?pp?BCkL297o&MLy9-i8 zp)_|m6!X{Ij{j0dv|GGFED$cleiQ{v$jn^Y(jr5@@|I^gr`Cj6BFdxy1Ki!Cu&9JGQ%P@q7)D6qdcy(w2s86(OlL=AY>`eR8YBWcIH}< z(4Lq$iu2I|VWhbJ&?EfdWB1Ow#g4iw2H`xFTzS+R*c3)z+0uf5@PaauRv|@p3o864 z^3pD1nBdxk(D|FHwOa;?u!HYwmT^_?YlmMe3=VvI{L?+-x1;GzXYf`xcC#wa*x6Tp z=c|A!hnR%6OG@8^l0lZM(R0bkQ9Y1bmq z;?IuTf~)2kxb6v)9?cLt$s@@^Fg{B%EL_~)t}gS9!s0*Gfk=|1rOsj9-oZgen$FWT zOFo%!KDT7`yPzgd>K^+P_Q2Gll-7|&!TiQ(1sm}ua9zp?2K>7P)Q(2hOnP1$mnIGZ zu>{7+9}Oi7l}sp0B`KBktIEQq8hS&v*P~V<+uV=;3dzV=%(sPQ{Tw#O8P+eN}N9;Tc`T`A080=Q+)JH=!8nBf<%tk!T;A zHnKCty>DG!#w-1%WOwxo&0leFeYO~3A__g&UscIZ3D~W#o3)mg_=zHPzenH9U5 zd)jvTd1L4;{XfrJqjrqEslW1?9s7HAP4Nhazb*H7q^uzZ;VjUQQX<>O2Nx_+r}s5ZVsFD@mV;#w-twyWjp{zi*%DWrwt~ zpSuMK*Y;D3U+=H2Vny4fjftPI7|;gXn7P zPruB^V4PMxQvGp}fG2$a{(a!V4v&tkUHi0cl|E|Firb6cX}bDjN#jaAa*luG51`bzOLMZ|cyL*NR#Zv! 
zZH8aOSHIl87jLv=|F-g8>*M!Y`~LZH<~qleBHUYAM&^-R!tYFp2u7VRm9A0y`}Q`e z=W&@VrAeeb12$8cH?0%KMNoDN4kSAZ+nn9p9?{Tz;G{foAfo&YTL22$db&|Bg*>6? z(+U38u0L#X+^DMs*LPviB-PM>*Y1iWwaj%t{0)Ee?{4cG@0xnCaXh(PyMnmgEa;{^ z<%H9JjgOx-zq(GOH?=xezU z#BRY*M^|B+zrVkskRVj(w4r^;eg>WgilU*aTpUmmyS;ubIM4zvZ zjHK-pM<=B7|7`EM-sEN59Y}DW%3a*d>#|TG(QAEyxheI@$C?R0vouSiF1`0fU_7`j;rsKT&*(mGn1+~Hs5q&Ay zjIZA$S402YBW(wU5J(O3yLU(G?rW!ST~u&3Em(~S2$dvj-WxnWdp6YfShA)&T2t-E zZm+%nig9R(hf7Tv=g(X|97^$#*y@vmHu_OdIePV?7cBR#&i53?bCqlAABxs072aI8 zla;u1-ZvH((uz%+o7_8&wh;eR=CGlmn}6vAl~^q>egEw*jYV1GDA4tr<_~N6QXIQC zCMG5Z28s`U<%l%O(QlaUxEGdRVq(nmoBpkV<(r=Ov0y~L$GWYn$*A~^Q_FgX+YBPa znIl)(7;9nb1s4hGMVdmngU}uk>aq-p)i6NEjEX2u=E#Wul)@eQeVDD%(Ozd)V}D&+ z4;xB)5o!I-#;YRtp?~+j%#wxev}>}ftDh(K(4O=$ zJ-rO~t2dvQilr^R)7Tq@*1iMq=Ec1)W`ne(@tI7PvCPcHdw&J?!Y3jNm`W3B&iCir zn3$+}fpd{kP*Um#I<=~*>h-Hv9m$`H4v?&0lB&xt=9(oV^iGUf?xm!p7#Z1C(L6K# z*5Gyoohew*T$N0%4}hK<`mVH(Bvq;5w3j;5R;tx|Xw?C24d7#TW##yI=QYTH!nkc0 zIzY1s{W2A0W$2BHot^`4OU=KB>XJ>Dm8=oAb9l(X&Q61reiT(8*R$YU>tWa!H1(kn z+T7f{Mb1mtt>PC)_xLgN_CE2sK~K;Cq$yDG<%j2qQIswuLF+V=y6!YPM36$W!Wbt^ zJdx$4u%jc#(5!dgLn#9OxnDg=e$F&qr78NdwJAnAsaLS!2^YHm0qjvRB%4 ziI17MosOOLNI+n^z5OO78>p#(EiGs^Bj}(%Nyl@c!4G~yzdk)QaYwA6;=0h^lk>DR z3Xa4{u=<4s!-8fdG0yM@z(hhZO2*>|lCVr}Qd>cNCHcpD1rSn)dwLitDSalm*4-ee z&wTg0k1hV<1w}-A^x>*t6$Bv@6BFowK!RF4`%JzY@}w(S|JYS&=4qc5(LH*8;Aq%k?!&+rgQBkl%=;eBN zT%@I?g0d~?P-Nh%q56LMlL=WE;OjS(AVDC-IbP=rM3}n zEva}F5D?HE$2k9%6PS;8xMcX~Yt;jY;t%ohk31h~YHE^l+4}!*y0#Ge^h0XBIri!a zEr`Fv!mR4%Z_{kwEL$^(07VA9Mv0|cBAOafh68Fcn_68|R8*ImG{1Uf+}${hu=JYV zt~3J;k_HgAwLkFsJv3xgF$(O1f}C77%e{c+aiO4Gh$A zettgi4?rLTH4Hua@Sq?(^y@tjN}ob=%Nd&CJM-;MYro9^x4ZKfd+(u9Z9P%51HDXJ=TIq<*q9jInzt|eGsNHw(J66o z!Y~MENkJe2uPThpcivf15oiUdIgs+2*Ug5(6PFbOs4hdI`kl8%k5XIFGZuB)sT33x z{1r`2=~m;1R9!&(GE}Gm-kp(|2}Dp~VIhy@pU=Qnk3o-dbkxk$RFUk3#lpe@v=CrH zHx|-)uxJKwM&ZQlU7!=`?1awX7sIHapdhosoP&+gJs7b7vmFT6>I849;218eFzIu3 zagkEx0{JF5p)fjhtz}3P9y;%@D1k~2j7nKqd3SGb)!w!G&E!VY%H-GQp{*I*ZGj$8 zxn>7Km!3Eb8Y$0j-hB4~IyLuLg$)dv2nV4@`Mam5I4f%bZV3%@YCcOsXk}Y3H1rmP zPfSek7*NyFHhNv9e18E7BQ4MVOo>}0BqUf^Sj2KUN0#WWns#B2VSPAG!Np3~(ECdA6(0J9<(QyLh0NftBL7<;gKHj=5gs1bxQVYZ} zNnCc&(xIuoYk(=yx+iV(zcpx<^q_y@hwT6$2fOTpC<q6MOPXv%Aern7+ z6%}oPOjT#QC=Kbgs`Aw&UPquUzY|uLsEW!n28N9h124S_ix><;VLhh`BJBh-ZzkaXb8a$|&1LXg@iDJiMm-d-jqCUD^#Zc>Cc zG}K`!#+mp#p#E|tCMJfkHMckh(M0%}dKw-m5fxeBYp=_bW$2xgu$le>aX@}JIt(pc zR(KNwMh7_<5oW;Q;o(S$9wb!60v4pw$;pQQXpip*Lr$@ttYZhC-)=m^jk7?>!vN*A z8DK6@J$qgpj6mnVv%NjHzCNkwla-}qOJD3Y3e@l4zc29bbWqqZAzHetK#vBYoE+{B z0u>&Iwcm$kz$uOMV;~q8HaF$A{G;^xxOsT!+vUv#3j%S;z<{K42LYUb21OdCClqDN z*R_l`|KdoR7#kBkc+m9XyD#kK4T!XUVWV_J{Ew{p4I(R`d6tyV^*5-%fSftno<>T! 
z0O#c6VtNld9TWYp!5tX+kQ}LZ*(=d+ASNJqX8J1)yj+V(TgMf#c2Y7N0DlJns&6#UiNTEK3Kc3%rbWF^({>*k?G%Rv{cjvX=k3l(D;CD^> z8eI8rY-y>rySuxqYh6PFA4C5&E|d!spYl~8Bh1aAMV_CTIS+IL@|lMzTqY@ad6V`d z*1SUoM@Han>O;0!=#2I{-)RRLrXMtqeo`ETD?Qg50XDt<6Rp)@k_klPpq9%kt5H z!XNYwLJsdSJG;B-Sy?}{V7&AoqC?nRje@h%f^yQ(@Xz?TfctUeT3Q}3j-bX{wXYMT z0Q4iAu1I$Wvr!QJhHdIR;%ge3x0 z$7wEeUo8}pxg9{td7Dc74iORjw2>*>7|@Tcg=OqJC+^4Bi&){sd&lkW>_|yUc64=t zj+3e4CP>Z{C2QgJ6fX^+Fc5c;Xg)_^rh(4K zo|_Ny6es5~?2JnMJ^Ve=$Z|KWSMMgM$~H$U9tG^sal?QJ=%SG$9v;8*l;k3yB9ezK zcXqA<;s=+oycSI`S?5quR(6~D4$G&Xkd;-)KD>f~um2QIu>Tkd0KCWoYky^SW5a!+ z<90#whsOp620QZ|!s&P^9`Vob3E`=>BM_fNANDHK2 zF_A=pPFSB5A9Ji#s_rDg>vmWK&LtX4078FxuqVjATYxJe3qxxmA6J41z%+;$ZAiu)qckU{ZT2llyek`#Nd%(G$OzzFHZab8W4B-C z|95u2o;Cy*L!x9Vs-un3XP_&E(oG+54@4Bu-adQ!6p49(D*;KB+V*9);ZsP+AVmZb zA$yaeiVA_oPbACTA^H6I^PQQ20cHNwzWE(zUVSsO#}V!Hbac*Fmkq~-_X6*ekO0E( z92h7EW^VGUQ*~Jx=$I=)7%DpMbvh6=e~yfdgquclK0;Ut;kCpu>VS?5^bJYy*2^>* z3aAPe;ZvY#h6Vl)S)i}CR~#Mw3VN*`xGeDUbIPW#UOg=NEZ27k(g1q;LfRWpZ-O&{ zM7XG^NW%bhjEu$VeRz=U zt;Q-rkqE$+`2PK&6LWa1=+1?=oK{#B+4@bMWF#bR!2ZC@&T?NWl|Rr*hSt{8r3L~( zl7rkDYP}BUbr{p=dZduT3#Boz*sx<*fDUD82dAd=-Q$r}EnqOC`}hH%x$-)my@~%o zH;^0a4&t#3lJtd>C6BmEM5MmQD2nG_SZD!u3$?|cB}o!BeSLBQf;}ki#Kgtn$wHcE zvG1|WBw9Dx*xEv78yOsop32wpp$LNQ0UT?G+qSjwyX^YQ=kjoJzQ6y>7^-T>Eoxiu zTsuo&<2#^5K!k@<^cBMQ;*R&7$XE?t@;=BqJ;o1^A+r@UP-v7bFIdP0JSPrSeglrT z|2y*?5H<_|U^fG<9xl;Co(Nd#KpzbsNVezBo`ZDywlEway!o(@kh#4R)pxXnStG1K zU%W#u36((@DpnSJr9aC_e&>#8WDc_|9)nW})hj5C6 ztM(v6T3d64Obm5`GHoti`25k<EGX5{7NLBVKH?+hYe zs1+dOxdI#jNJbr?z>3`rABK`UmEB^v9`GG7J#1{h;fz~a496;wdMSuOncQtlRoN#D z!gK)j0a5|>UVJC$g_;(ktEUGvQh7xMJw3e*>D{KF`Quvx=aKR8BOw^z@W5DD8X2)e z;SDeECK{TNiAm_laZFCfFpR$5#=`@#eawdstN`-CP3*xa;A9|LXl#|3A;UNkCHfqc z{qRkXWGm}q_siYIZkmKA?CeAB?d?B*TEes&P*qTwLIa=pn0a!B|uk>14zbQx$>zsGyfWgjDNNu!RmIv+uWTllm5%!A+mt?_zOgqP));tG1M+B*aS{*ifQj)LSgF)uO@*RidOrH8XL%HiW(YC z;Brv5v>tKz(6W4zH87Y4BSHpUcQ+Q?nDCKD_3QC;Wilof9e%Fl(p3LnK2m&v_ZHE$fpDd85 zeuhyGfV{YX1A#mUsfoCcL>0xhwzg0)gg+3tocWH*oeZ)GU^JoF8jEA%NDBwqGSmjs z4Q@`YA%tLCi?+3@64XY;jm~c(A<+p7U#qpdM)8_6e0#bnwR_QV<=6L-k@&T@G115G zin6nZigl`Cq6G3XQ`Giz^jo(yVj}LKm@3N4hr=&QfE&Sa+1c8LlktcwQMmb$x678WJezaXe541?K+NB!vAlCVKoWcf5;>tqcvbR23-iU?Nc^fI}b%fGit<2wsqj zI>dslQ*IbsQ{fC{Ue<*`9Ss~xi`!h3SUEQdnCacS5_794kvD-CY!1X32UGxj28=Id z<>bK5pX7p6ud}lg>Ne11ZUo|xpf8@f>z50JI&Mt_Bc$eRWQlsCfZ;%tS3>YeUKWa@6-Pre%HyYK-&UMVW%;d@Mktzm$RaN&#G#Ar^}JfS|unz)#l!}in#FmGUiOn{0oU^)&>@XggI7zorDj^JS^C@6+Kk8g=aIM(3o zP411o)I0X`E#GxgcsM-P$oqL^+*)IbA)EWlaR0yne)TCCm#v|;cEZnIET}7Y)c?%XF19Gc$l4twzgRr>9fb zG&>}=YY%(R#+b`~GV%-r`yOp~p=V|1byMjw9Svm5K*=$=(w_;W5uuHdayfTl%qWIx z3xr^paHw@ZvEA8>F7T$4`g*3@QAy7BOHe7a2x?VrQBnVjioHCg%(?k_aQv~S;}8Kl zF^jVE-WZ~KGe9K*W5sSRE=qs&6RGNPNuPy&B7>YPupyqeVt)KjtMyTL1~`*)EFyQugAq;)tdxvz+z=BRhL+_abyeBUEobyuZ7>4o!a|aQk0G*M(r% zL#JNPH!EU?j@C6Ib?a{c;kLoVpG&!=Qz5-~MRm$~43pOYRKWb&XMlO&H_Bw8*OBFf zEz|;-kb^KVSXo)qxry#)*xB=N$&C{?PA59Jt?v`Yhs0(NWIz>mdUDc}%%g$(fY1Va zVg&7uVBuLGl68hM8`8|#QlhnFgfz;FI;u9@IefGaSF~7p;Q^xm8H7sWuu71mj%qvq zXSl-$sl^2ar;t|wBz!gQAvO~gBg1$@o>uXZP4q8)Va$>%5oL>U1rq9kMS`8n5$}n> ze7H6+Zz~X^AIT>vezCgE!tTKS9wsZV3;%Fiw;X!6+1vJk!bVJEaP%s6h*+EWu0tWI z&FP=YO<$j^_RU%W_rzn!k$_<$dQPFeYO_6Il_atdTkFblChL$;q=YE<47 ztKJ?Q&%i;7z0oZgwEB6gspCp+sCH-l_xIt=p`rb{x}xKE982+govb9J?0Ml@m{4s3 zl!pEaGDP(Ily3_Z>Ut@lKqjJl`0ybuEiFI4{-`>3%y?PxycWNJ`2JN0XccW>Qy8P3(JSWfU^s zj{7KN2*V(W5=_qD5$rze3lYtLhR>&GCXO&8I;Moe zg&baEikF#_*#GzbLeAtRkBGrQdi=lNZxcvVDVSn4 zSvNE}NycQX)&oKDEh&<-%E!&Rt@^wi#hVSHJ$gF82qm%(jUVkz2PQ8qbFDswRlhEB zB`lk=lj`6}`u$8pPk)oxd8souF6GXZ^U>d1ug^ZNy{*+2=b&2Jd{3X(zf$;Vbit>o 
z$qR7K+xn9Sna7PK>Zx@9`K32d%S1rA3b8SHjm~`TpR@*@)%7ZA_^sQI@~`9K#5yiy z9@;{uwG~VZpV{qo-S$Kpx>UBaIGJA-H_(oEoq0)FcXxq|KFOr8ceU}aE#;K&=t2un zMR>x%!^9s3BOOuC$jAtoJ;+;W+>ylpx6oaE=#2Yvw#g-#b27d?R%-eD{GCuZv3Sb& zT6X3f5>B(B$%6(h`-&4KnxPN5MXQ}@I;VQ{R>mB)PhnaWTvwS)3>7HdH*elxU|;|> zBPin!4TjbIA3uZ(P)`arnnr2$P_5faL0e-#7@*O;?w;)B)WE3HW8@XSkE!urWlvK| zolmrS+wrIV)uqel=I69{{C|GE1Faw~;~QI!lX4iu1ttgzmMacZdL%Q@0-)s`_Wg{~ zQUwPJ1H1>!53VHo+zJN2<*T?p+o^upo?%h#xbTRS|McFvfMd*u{WoX*myQ#7TZw~S z5+0*k^*+&ezjFn`bGAl!bkd!GUQ&LGp zhDb7t%taDO84Dqq=XoAufA60C4!?cu{l~tJ_jupu>6QDsuWPMyo$EY5-|sqMzqXY6 z<`89PduAv-&x7Ei3J35!eS?EY@~$t8YNwK<>?Ecq@ogKq9@3l0&_Mzm--&1L! z%a?o}&%|x+naD#`IwJ}CnM}nB+W+9}m1)I-3c*lN+;cj-|4i`DZsCTU_Yz@>6=EbY zEMh+ynk9pDFB(}_jSK#HTsXSDH^S9CK*{lN%*wp``Q6dKb>>4`6E&AZp3@!Jm`l!* z*DHBYH7eYfyw7!9wxa9OWv|!N&M|HMA#dKiK_&CXjT`%h{#sB7cX%&i?qHV#vMNkp zxwbsFj-{;GEh=I!%Ymp$%PzCg*Bq;Qdi=&xz2C!>vgBPXG!Lyt=>E)1J~Q2Q4dp%5JT{CO9m=3|RieOHC6Qby9=^Bvw#R{*lg_<8R3>|9LCu=o zyZT*mB!2d1pOvxKkqgTu>szUB+t}KO{K$(w=Fyy4^x#$0Ii0oJ`)Go*_yu46H#f93 z?7z67vL#X=xKo=_?ukoFSF417w?w|mr)R0yIk#i_vYLSsNhjHrdalstF)?Vo@j4i` z6Ws-NUH$z|-0r7fYU1*l|A6BWC(b$in%_>Qr#%~}ITGh)aDa!GXZzh_k>AM9uqGyZ zvR#d76}+=uQ_&{eM8;E^Tq$7LQGEGw*Is{m$sBjxx^+!?c6qvs5}qASHKc#qX^%7B zuRnZc`DJ8rK0V2y8kz9c)U;j(YXgb)w^f(Jlz=s&qEq5NcbV3L_^@aWQS{Ks#=;a>06U%bz` zgDB3sOgp(v&og~H8oa$UQmMObNKJ}SMUJm|ASe9~y+F<=y#$r?z-@ozD}4j5IGFBa z*|PUgywAadP3jVMF;kxn9+nE0RPYb2BnjRc`t-r9@Yd8VPsN&6#d{aOeA%({3+vEW z!t$T9*{MNiSQVRoG;-v*Xa+sY4WkU4>;2-r`*&yWB`MVu`6nz&ZA}LIGlFyS^XbR+ zsi>>twskgQqkMr5GBS?hhxIiTcN|_HUndj2+5gCCdM+=1TKXa-efrd6(OkiUbF*Cx z6i$|D>&kH{@0X8QZ7wf-<7HqI9jMRUUNc^p+u7BDXmN{IVciL+N zizZ2K2o|ud{8`xf$Yc+Lh0LDaX9xCu(-YJw3U4gtLII6Iz<7Ga$a)9UtkvyLw-!dj zQ8IY^n5NMW&TH&?GYj9T6;D{{3yXibZ7zIeG2UUUg!|7$#nE`q6oDOgttniTOw(B>MXtDUNz288qKQLkW+6PiO>siklWXP;=O>MUH`zC^5*)1OQKyWQ5-?r zDy&SyPTTXc{=T32J#OYsN5lx}vno*qDtx)57afxdtgI$ZKP%3uHd}woPnVNCwDITG z#l(UiQTJOoVN!93qUqUg&BCGj9R*V@XqSEePFvP#P3Sfq{gDEPj$s}NUz6+3r3X4a z@~UTdYq;%JZ-~0SSQ7G{rPL-j(N*puf5bU1|9kiHUE~h0>5YO~o zY=~p>ZKQv}uPF%<>F8{yovYiSR(^dVLob(QY@<}9$ESe#L0Zz!r9IoeY^+b0?{Z$@ zilg0ll@Zdi?FbK#j&>g7NA``=`_$8)sj?B2QQi)Nq&?H53@=Dl*UZA=1n?#p0tA^m zqBz{+c-P=mMtdOqZ=Quv0(hP;H6oO~HZs5F ztem6^$mAaCJk8kBa4f^yXg9NvHMH8j>EspJik>>Mj^hVYN1cnx+qKl$wzi1i6B(Jb z+*=mN0(Ulfy6B|V>$w}*K!B?=vKf6r2B?4hO26X??K6(dczc)2*7ose)4 zWW+E*WERQhfA4rhwUuZOHxd;jLBum4at#=V!tiSmu7&tqpTGUeaDzEjcQ|sJ-mjuRHNIcnfR( zDvwK4deoQfwp)ooH)?C2pBZa3^DXE#9LdE{@@W7HhANIOWTuNrH`2;R(*hE>gC1Yf z+C{b&!~ItO`ueE4Nnh3)pPx7>ZsP9dZE=U!a~fRUtD7`)yu8;e_Nb;xiFVn{S4Gy> z2c}QwWu74US~VV6&6mMvbq2v!Mlo}8<{00=WR&s8jRV#<)5F0AJ#~h~a%;@cPR}Q9XTuoW7>{^Foj_0?KYS)w_zK3=YU%1_qT%16tb5lcyE;(nWhKhks10D7<)8L*p^>T`RJ3DR25! 
zzJ~epmb|q(PScKaRy4^kRrG@PK z-@R9J`6F=hOIGjnv0Z)dJ7u0PE@VRIpg^Oumh!G=qcF>;Q2j;I4VRy2vY{Oy!PHe_ zDhNh#nrAz8e*LKpzTqgv`;3obVpNw?Nk~)92e?rYQ*df1!0z77lEz zXO2z$aO=H8p+}ZqPacIv#JpfBjqvm0Ya5Oh+7jF4s7U*=_no}0K_0@=Ns^&f$gp6) zwqIvMOeEvkQu)i-;q2G10mx*akxZj6uPHX^PVnId+KNV_(u4if9W48%$oq16WlokA zT?wMzoy7dDaC+lw)$3LNvkTQi$t2=k6HihlMExE#9F95SaFS18vu2N{<)vDWP4`G` zo%&0<{h!E=GQUnZafCZM*o<+hIo52JTfw`g)Zan+&5iNd@oI0z_cq)rx6DY-3T~C` zs}|dRyw-A6rg19uXS!O})TVf?vD%$q)-q6u4;)|(ddVssQ8qqrTdL)yFs>-MjY%`^ z!zbUU_Qyj*-fy2KM;|RU`u5p?C4?O{_UVNM`#;R(#hH{{2_!>DWaGAJn4=>DAcwmC z{a~uF`{mOrTJFN0yjD}fN{O>=yE8AzwEYO>5#^fxY9lcIQ=6-wOyK1$KhLg>8%cqO zb#93wtg0B?8u(_uwyyy<%l&YHxCb%>vLS1FWtZqqWx-b(n^@uXXmVol{%ZG zA>*~K>~bI^aFr_i!K0eQ3RJ_9xfvRDCT%1g+qEQ3By)cL)bsdE8ZQOK6ejT}an}Yf z?JwTH5# zbd+Y>rO?>W(12E1pBNW$Lcwaxo;9D5g=M0@e;zGP(0Ma-WW^6qd0wXM52j(HY7NyX=w#RH}N6ynyEBCd}{qJu%lQDN)Ve5v^K#OFeFw=h03z-p)Ir)W&Bj5@2(8u3R3J z7pitHs6$S?GhScG{g^Q}J3nvd*+#l`=*|{us?o8aO7jJ@CedL_NPm6Y_I)0ozQrk1 zuCzw0!Xm3zHeti82TbUlKCYM^)YjKeNlMDQ+F86<_A<;fon)X^)Hdo7^v$a%M5;4i+&Oe7unAAL&F;FatWTi>-!6PDjcTBv zzkhq7RPRx$d@8xm_xyWPO|l#3j&X|4=?6_LzUARM-H_JpA*fzcUA>G_J{p$ksi{15 zYdg}?{*;l&Gi40bi})oBzp?m?q;PYoY%?$+ws6^lRi&+|Cc8*vjYW*@9IWUJi(SxP z@QgTWcyFp2%W`p|$edL)(ZB1BxdLv8`@#Or3)QDD?9F4f_g?$3Vw&I#iUla`5?EU; z+uURYm;Fd28jMj;)7I6ElXT<1H}*=&jQEp}V`BcmTI|pKcQpW%y@e)26s=Z&{$2`N zI|$2h02TG7i5_g4V<%bnPDaho;;!shziV zaJP?8ZeCtz&ZV=mvLDa^cG`b^n9AsQ*};i2H~W~!1Uyh8!ho+WaIsJ}`Flrc@%Den zPs*;&@85IL8lD(q_L1KC@*=)qSXdaE;6MwEEhuATya_SU*9Jz%E;AopTG6?}(`% zHP@WJVxRKmR!eNn)wg9&=eVrZqcazJhv1;7g;$&$+(X2Eycn;W=D z7w_EWAHi%zb1#_w=tp~bW3T1<2h1)~t**%jP4;T%in%6bS=3#1{2rnH#^l!2*oeNP zG@}kAhy4Efv2h_R*5>G8F|egh`ceE{Ksi7r4s zuCOmKv#GrF_PXy}xx}5?%WyhHjO|^!-?rn|8e@OG@8m#1mlp<8C-7OIRRa0Zm}4we zeKVKqV?9ys1#Q{q`tbhpoYfI~ldV23ZD4#cOz_ifc8h>=KRIhMB?gxHsnXTIZ$Gtf zZLRsuQE!84jSj|)6KIH8eTxJ}>NM7J6x98@dlUUm`@n*H{_J0Ap65Bj(0~l^t=p6_ zmS*%b_cAbK85TSkz9xGD5eHRn)XzUwRDhKW8fNcQvn-_@Z|W?+Rk8k%NP(*%FI~ixNw9oP8Pk$5LMFk@Z6z?d`uyJq_jEw%GSN zieXU4qBoAG1|}9j`YA0=gaE<@&|g^5pyiEA)b6K`?yjx7fsf(RpdPvd|K1NJDWsOaeG?%BB$-flDt z6W5%bsYyp1g?cec;lSuO`drX~@Ie&)Zxra^+4jL_3%nD3lBv|wUcc6RYyAjLcyDdK zKMo74Y_Oz0IBV78PVZ9%P7u3qM8A{Z;Lv-;08HS0d>)LBUXJB<7>)+w4G1cfMN$L}>*8oW-&=9-fh_~`DYe)t z8vwBb=i$RsSZwgae^wXJz&KQV(B^`nVF`-ZsMp-Te;=qgi;Ncalv zIJDf9gM*`6j2$V*7En~+-_HK%1kQ#B48>9(5?FM1p*4}>BRUrrV9L` zs_)<5RsPBQ-aSNkBQc#D239RE~ksjf{-MM@xHe`~nyYkpqU0JOk$z zw1jYW!tu-B7)eO(5D6G^av)o@Mo$XmHuwk=3I`tnFnsj%DLwo@sj1CuZ6#p!j-R77 z($)Pma1C)+(6L(<@Dx}GNEpxpUm9x-9WJK28?de)DN2wPLpZ{i!%)TW_L%7S_TXjOAyEed4yh~JTIR=}{o`YVb0b(q zuRaI&0rQDQ;Tl^F!kWMDj3Bp1jER2D&HJnB^58+`Nn1PVyF zfGvfit%;c%yXJlj`#KCJeCdNqgS4lJ1zlw1?J?VWied~t+B>|2VmXi8|r-C<& zM|PB(o5}+RlF-2k?@nDiLTxc(7yV4{-Bfh!>OmZf@$$enGIfS^1Y9LgpG4?_@( zJ62P=IT4_N{-L|;J7f*e4K+XCjj<%AI!}<{7I3EoZCB}Zj!>M7$BeHES`#Z3s(*NB zDD;CPl-q;oMQ6E&VT9P-+T2V;KzVp})U|>>rdij9d;+`^q>q}Kh$FFJ^YELH({Ma8 zt$ZZb{mV+74mTbn?J4oVVk@CNk!0?k?8)wAriTRi0j%}gi4_-lJ8ZhIB4vih{C&qY zI?|q8VBid@$FSa9^PO%36>`?j?XEisL3U~D3apzOl3elAL;IPy!bZBgeK5-i?2V3- zxvZ?L%al0V<}u|p>f5BtQ8KrX@qnHceQ)~l?)KW!jKQh%2TNjS2h#_CQTA4_)S@%(@{**u5qRq;S;mA+ro1C1?ekkKlJn=9KD&_xKS;ZG+ zuJc^P4h!NOL>&TwX7?b>_q!#D%P%3p@76}T;}~XS?!#b;Vv2HieN~xE7)z(5Y;Ubt z$S@I*juZsVj?1u!AosAL;lhh5(s)$_V2F+ES7(SgOy$B@J>$ z88#P=%1uvNe4HSko1HyH#lgpyp0u-jzasmNqn<|L7^qL42)#ExeLAsi?oHC#>Z`RU z9BE!NCS8wP))>RS%+Ji2z0sRsqnh3AL#nB(%P_h|p6Nd(xn~x~cf{|u-!Oh4iOmav z(CF)GL?BEf^&tMsM}(QeA$=gQxIB0XgttDMm+LPu(J81uF@MMN^t6k3d?InjL?FBK z0&QQHy}e4u^;td}tXxaJvmn40n7PemD=`34(a@Z7*-r2vBMt&~gTK$po?_=t+M!;3 zQryEsD!tmT51ackT<9)Y_UC<5Ix5Df`E7Gm7tI#?R{==VamR};E}MLL 
zx=NeoPlZ$id*7F3o3^)bxRz-$kk}1Wu9+M-;#eQIH?v+*NonUn>n(&{`O#yaQe-B^ zOuoEV3OUY|t9gjEe}qRZxq-cV8}51Qj=bHc(W@!QF)uFbK$=^DSd)$f&ho(1GG zjXtE)Gc$pqp@)li95h?#S~{1$ZXtf};SZr^Ws0eV7dD56COn%g#?O$-iFxd*xZ$w0 zC;342QdU5_nEi}){@8Ecwa+&36vNzHVJg#@V?Nx|XK87V7ny9<-1^gPCK7sC__B)t z&A%#f5?*haW>*ZE`?AHmq-w6^oL_dIa=(;wvj3O)iSqT+oeBb~U1_K8YLnci?62?r zW%cabATUd`JqWUokDM&IHsSXqL4q<(D7@(EBCk zsneIZ>G#DsUf7)_lOH90apYxIdVF8@^3(L8-vQ*_AsTYjqz9IasISk3KC4>Mbvs0n zkUk_hr4`d5WXMxC{gaWY9fv(}Z4`}Pu$9W{UiHvyFBr(Z_WN9qNT@I69S;&hG2i1a zqhc+i4&HI|Q;#ibjg(rPjGdcsY>EP~%G8g#=pI%_iI@Y7wAtiY_q5S@m5guJ^8fkvQS8aeie zk{Ab;tDrr_(HSG3Q?x)GJ6D?Qv3~04xd^=9m%@PX=?DkM%W(bbt9AADStCET_*X!* z!u681fsPng2rIc6C|UxL-7Vvr@EQnyO3D#+P8CtaulNq4@;W#7bfbg2zvXdvs5(iX za&#hG`Ak;V*N2|t^8Ujc`gv}Co+CS!pOP2}iANi46jQ~@AH*VEUSJlnl^9h{B3VGD zLBVRXE!Z|NQ#q$6RtBp-!A206cln{CK?x91a2D_XQia?}>Y{$3^jLpCDfL@|iyYQ9 z%i~N6ji}`;4maKzzV^ZGZ@9$H4yPMEhS4o>def%(ogc-O$Ot}JuwTL&j>R1RCoVK7 zA%cJ(8mS!i3#hWA7=k){Ts;_ZTzHJyIy#C}^7{G^+YtYxx*F;zWExnxaUuPHRd;#Z zktdUpvPbposNitIgVO)d51RJL<*Z$@3|fYU(MYPBWkJMY&kv-h^$Hyw9i%Ws zwds>5un5!8KqVD70zHB6`|SWVnV~#GKJ3em8U>W+*xr1XiMpYbii- zkX$7vTiM#SV-tXc3b-XKs)a6ck}=<2*C%k0_*b~k^eUK}_aXryH4+vO*ds@E{8>6Q zZ16Y4U(2;p!-l>5PI5OXva)owv|wQz-+e@5THXd4 znDJbi2e?rV>s6%D_;H}aP!*kIRYRu>l@yfqpmByg6x$JbZSEJy5ZF&acpL~dcR;{0 z@cyq~zuE{z<>f83-n(@9vf@G4k-Tsuhxqz?*X@+go%;sy4b@9HnPY~bs?H>A`^1!A zuSSsZKE?qh>q*9}Eib_4<>Fz3i{H^chQbRR@X=Tp!MK2BvAw-&cc1ML46Hc0xT;`Y zFE5WH7Q&P{pjiY+3~5Rg*lOF$u1-@H2nlk_<2cB`a6jcL-*6`UfZ;NP%iO|Z#RGox z8Sz_oGaY{!erOMeuD*U7cA!(8B@EDVMnphwApzQx?9MUlB$-1LU0h>)Z?Vx{eSe$X zBR?Ex=5WIvMhyCr5(RPri~_*UMF1W_xedFlB2mZl9YZn-P}{{O0d2ZL2;87sQ&U6X z#OnC7FfLUMZS5El`*z$hnl_DHyaAahQa9W$;t0|Fq*%fifB_^7i}(x-9wP|1Aeq7t!iBDYj){qyi|euvMy&iW zsthRY9TyV%m3Jiq#YgN{@KS^J@|wnIY9e39=D{1*I8YJ!`Bmmz`igx9Os+}4-VNM_ zLFgiWl64B(kvn9&(UgOB<_k(OJ9g{ZoXG5rKdYxgV53lGyGA27bYg5n{W5 z%(Ll;tv>W-j8&G$|B=kw0T_UQ3Ogbw>~TOv!r3rEe0}WEJfJkdE)!im;t3X>tMk~7 zQXAv09>|>W)^QHPMHHrCAl0Z4_|Va_1=;pN z>)XqLN4*z;;p6EsA2TPtXNJu;c5)Cflrc_r?0%%oVPet+*)arC>)62)ciwJOgEKee zTRCZntAsB*#XU1bD6q$x7cMLh#d*VjPT57FQyCRRyd|P_6wU#FoDyDmvM&p0u^Ie^ znFM`NdvmjE&qqHXIEY6LmX-Ip(EZ<3m~Q!@#vb{vG6~niZ#De@zsh;n^TKK@0RG z>YU7x_>2rs_!od9M|}DKeI~Y5=KMbYsVOx~zQQVN3ZOwpYCe6D#wyfT0L#~L6>a&r)aaoB3N

7#ZpKCTjOd=VMtb z#4AeJ$V0t6a1FX3-vn;;a2HIcA@5uN-pIOWn3?rdKRR@Tt_Cj)2~Hrt(tX>nXRaK( z<`69X*weVx3TeXo8#4*rKQrszwx6FQA0r@5(SXr`EjGCnzRC(=17&R|v=lFY{Bm4c z($;Sa6ZTmzhn@YJ$1C+_GR~?mFo(4q9T(`?lZ*2INpT^ed9-p+apx?iKk#9l@8RYB zlvXGM;gLbJ{kY2#3F))+2NrLx)@sA#iSk_^ItVjqb!zrk^Qm~>6<`0(qFwgkv~+RP zyr=Y~nkN`;b{+W;%v&5R-iDmAY1V?$XE5Ec027>FLZZF8I$+q~(+=L?Ut+Fi^YMq= zLeDimeZ?OPsts*K836$Y)adBx>G8e_L+k%jNRstMD{->L)O%&*n>h_S~gD3B&cgq%z|Tt z((@&pD@(U{-(@&gb_aCu-H;XF7dN?^9^^SJ;BRgxvhOt2WNxj_?4eEfdL=5-w43kj z+jAmLeB{r_ei}RRDT4`r2|}Q)TeqT)&zm0&V1b}#X7(Z{XBJJKKz@14Wj{GuI^|K; zjoRv8qmG4qz88J-(;Up0z>f9mxMu7}4>iD!O4g zIoD)m35|XMyHYEE=5^JJHFn9XxXrp&n1p7%dBLviIvzf~)wn?NUXWrU@%~Z5KsEeO z@l*O94{64r1ayFXckd$vnBe3ZEMBPoBiY0TJ|MKDh-axJNqT(H_l?G4Z;@D_ta^Mdpw*N&5QdL)H=QXrD1NdLM zt<(ErduslnMUZ+ktGX0^7ItKGgp4 z-ht{;ZDzO&co#$*X!M_O@0?u`9RD}?=Bk_bA~uIRD1lq0a;j=*S$?r|w`S|T(p38% zPSrD&HE}WPUpf^{w9lU}vXW_4@ct}x(m6e(@skY}a`+IPS5tHNa~!8O$&qIKl*KNL z2Kys61WoF-U*bos>R)9a)9HQYoKhR-t~X~Fx|SWzMEmr-2CbX>wcin=%YL^lm^Wh2 z?t6dZ{KA7r{LzWh!ZvI8Se`yFrW58fyDJsUY&bdf-MmB|Y_>JGZC>5p@}TjP9lCwc zX+Yj9UG6Hh1?U}fwq82af975h=9O{Vk4}loMtoD1^cG1OnF{-vESdJozQ*FY#IX`J zhG*ihMpdP6cyZ>$z2TlOIm8@&Bs?a^>I<8i_RxOm#l+KU?(V8ry6(vF-n3#dTpdds z6;Y&>p$Qa%lS>^Nr~fTty_J18asfR)`U6&6%_nzV*6^TQ?`!q{=;?ewtT?lANc_lJ zYaHJ-gCg74@0NqFF#Q)GXITE3lFYft54bU8 zTiiLHlz5%?mrHT`iI7?DrJkIrx=n$1qjTg@N5a=;)h(stY%jHzm2a`=m(C}7@^E(8 zU+rhceMKr9Xoej9jb3eB|7p6Ojjpw|wbP_Wsmjn`?|SEMVQsB^Ft8-Ttg`id*NNDX zj?hygbI*kJ$assqRuygT*qpgK`p17IQc)%?>wU0mvu@yB;BhtQC1%#EvVPwQX&y-7 zpgH3v9u<0k;-i3ySpKz~_m7I-?J`o&UFx>Eq9((;DZ-JwtTOz-{c66a%ah@p)SG`g z=~?dcta}+0E)G-@_85BZl~C2Gwfs5$UR|NKij`Z>Ao`(vH9T4+1wmy^P>+G>7b93j*Ej?i}l_R*Sxql1fj&*^)}2Y;Dmu0u>#9Q z5x#wPm))ZuQs1={>vVW|QiI7U>e;6K)11Le?S`yH$+2~2)8e~sO}Sr#73t)ge+sDT}{6k{kC~NXs0~`Pf=q zw~KFI?%{*EZ*#agEmJQ4xG5Up^I9M*iQ;fyfdBXVp7p9NzpJ^^s?{XK6F0hNPc!{` z)g0z>bL8Q2(uj@1`ruiG?5LnK{P${(RC_KhvhXCchaOz7ncd+PoJg%Oy@pm~>F`z5 z9pa-?Gc$GI^tEK-=S z>i1-DcV2St=CL}hz^wY^gHvPUpWGkDEPsNJ+}LDWyW#p(U`g6`jdeM0(~(3W>RhO_ zg}3y{*1~ts2K{+>cps^IImuC%)oSds;qn|^Yos`|eEfB0=GD0x$}t3zjj`f?VZF^Q zEj3>PCEKIcx{92qs+LuMEdPqUAiUHTCe(OutR#E2_~tITBU-j6YJ;2Vy?n;q*vt-J zEHMgb^jCG6m|3`~RblgTuV47JzB);1U-sjZ$+J^@4I3iQ?bw_TT{E%`>JAJKW89cN zr0C}8s&T3`q#JZ$8?7BTUCj_pw7}kQU7dt@J$uy&H&5;9Esq}u9ZyMWq&S`Y{fp(b zF;AR5jO&~o=Yhf{ePlLw<1cWZG@iSnt8g#eanR#t&x%dzuX#oLR;fv`Dfd#)l`ptw4D3?&{ZfFitCG`?wJj9sjMZ|t1r@c&T@ub zpyS+^6=Y_CmI$LSyRxT^&0e-}O>LV(-(A`l>^2TSHtEIx#!tiM{TlP5H)?FNWgUJW zc$Y7HI-C+6wJ2-I93tp}CeDy%iqV**5Nc#ivc|@x*r}!$q_2oB)7ZGX{8=hrSvV&1 z>|v1u|8sgi$Ac+Xt~^ptPt8d++T(sa%7BMM%Y87fm@9Do-MflJ>wnik6UMxIRJ`{3 zhng`?O>FcCk*=JSM#lf>ZnGDUgP8Yn-cIj5X{oRK%{Ww9R5d`MeS03`qWqjeZR5)H zB)!0x%^A*waj}}GExp!vu~{h>M6^@hMMjdvZMX$_2QxP>ulMS_%5yiqJ$p)D)J*UD zWgUCIs=U6{_v4zQxL$VY+27wj27LSKZPJkGZy#0Bzg^;G)Td#q-4UEK##8(FyH0Nt zR7U}xpdv4)b;az)QQse&QunjucPD2)m-(d^e*P2{X}Q=qnOgi0ISZvF*)iHl1y4ic zd*LzB~~< zuJcEd-gIm{vsm9~c*jO5b@R`o%08{qV9BB%YXzeG9bVTTou+EL{UCpK{dsW79;q6B zt2yz#4v7you8ofra0M+!u8)a`=Jh)BGVBlfk};V5wd>^1Tt~H2&O`UKlcl^XG@V#a zC{OFPd!z_OH~Y7?KOsA65>l2GDCM0mKECGfYw7K^Z2Fm^nd^K0 zUxd3+^%w>fau-a^Q!*nN&GC;=NDd4?v&p3oISw?*%#z>EA0Sz$@HIf&waZ1S|V zv?Rolf^Ka>AkBcu2H!IaGMFb7b3#Lesr|bLfH8PslS~<_}Ly+CYxJwa+ zUeOUwV$ZQcA2lhxxQjFI_F7L}`vF^*&c-XgtVs-7ExBX7v)?zRa=&y&J~&9xdFW+% zP0DDpchREIjC6RTOM(62H_llb)=5&9n*E=-zN$q3dSXId^2J!a>aG(<&uLdz^_n(6 zomrQ;O1Geu3C0jd@UpOKNYlx%t*^AtRI@h%hmsvbi;=;DR=jA;`TDP454{bldA-n* zV;Qpj_Oo=wxSqk-XRlOk8?08?90Y_CqO)k4M-3K(jZV|KygM~pW_Y+`{#AN%ncny6 z)!>i3Wgai3ZBEPVXuC0TCsjmoYzGngSlslUvWy$}Z(dg$#a2+@I7IZ6>F#kWL$Zbo z?eRte(c@7c1@DKwYdqC+CGL{UpUTvtqk3XIW#egb$7a2?)-pc463y@~)QUSNGQC+) 
zzK3kmyNoja@lC;xHHIu9H{gSZ?}shtm7#^0@mv-k=Qy~0+1lChff!d|QuOBHF2eV}xFlFpq#uq~&9hRNguPG4sYeWl5SbhQBtQw*X8)svPt3Pza z(17W2N3lla_X;ve@rJU?O*N=+e8o;OrKC(RE>(D~xLa%A`EP29bme<^7*vKVH*#bc)D1Ud+gP4`N#+93;!IzTfS#)g8ykUft03GKSFp5 zTB{!w@Kf(`1R1Q)fTo~9W>$OhX<$RcfnrN9g?8_$c8Sw+)!!?i&}pt(J3m|=B)UY9UsUWG?piQdiV2iME@3b*Fthb0I2qt^TZi zNIH0qgJc0!>H~yx&5)Y9ya#%^hr7uDGz>c{h9W4)s?+(_D#@Pp7Y#FyVKV8 zY;4?Sqe1M|{N?M%$`%?w1y`D@K0B_UuU~XxEK$;(|9+M);{|_6nhOSP!npqwoE#D=GPsEc^F|L{ z2^}ph;7%jo#2T!&SSt>#@!d0n3kCZlWh{9glQ`Z5tYQriCr>Xfz6t0Gjv+B!PVJZ3 zc5dHpGw`dPa$X;cYG)z9`Zq$xz8*-iZ%sm4-5-4 zCKSXqiV2?~1{<|F?+x?{zJZ2H__}h=GTC3{i?}KBOc-n)0fiV6l2=f`F!p}t;E(SD zguB*O8+MZDgk*svLll8;Q~8y1><2-YXEJZI<>Jykc>AyxNcI4?c147-ViiS zEGjJI;N*nwHtNq<410z7J}^YrV(rR<#UIRvP>K^077olDK7qNtdb4NnMm%t$>YqQw z`1$KPJ3Zl?L-gc-qbILRS{fD}E{p@GQPxLv9)<6;7EkzaQ0ak3${!~+(UNJ%Af6-% zRPo1;d6b{!TesqvINi=03I`gKxFEOI!gI1!C0A0Hp! z7O?6&iKaLjHbTr96A^)Ac;4(Z&{cqptVADd2n`(YbD1m`d9?*$)4ubBs3=+-2+W^h z6;@kY8+a5X)fOsrslq>e_yFHDY{hZ+Ybz@-E(5mt1P5FLLOXh=-0}K#FBmQpTjX(N zoMeA%XkZ{Ezcn__pml{}BpL{CSH|8KZLGcZ871M7k&Ixa-Q3)Sh4auTBBHA?;f95G zh~+hjr!pO8+(EFKMtcsXdqM!<&>95(jPkKLuWYe`k`fvSyV#^JDk^dvJ2nHD53wGM zDJkMUK@=D$Tu#wBCi>|D(L}3@NNB^e$;Ie#YH8^P>XSHs5MjnnL=tq}$&(jpmKv`U ztLD=1Lj*Z^5BL{VfMOy}#^5`AcnDt%Ren_LAQ6zdKCagqs5HEa?ihkLygAEogb6xc zz;r>;?PimH{`w|*Shzm9D?MK$8sg!MRU+*xh6B_G2KGW5!^YMH4%OS+`}FAwq7RUz zC6$puYh~XI%ENF{%goARV`MC}{PGyJ9vr{iSXt>e-1Lcvb5VT+eE>QVb&|QhP?=Z> z*BVG3TUr*0&ix%7E=Fwk%s?>!;>3V*adt-AR}y_T;xdez&B#!7ah;%{p+WtMYCrj1 zbW0nsHWU4UF)Njnlq4l2h{+CkJM<6Hql3jAN3q1mmq|%P`xA^$yiur#KwaY$`+(r! zxtW;*j^E|TyCJHj9<`;2+t$rX6WBjI%srfoLO%ol#h(CQaX2>YrN##bMbNS}G&DT^ z?AM~~V@OLz@w-6I<0sLprhaEwZ7R6BG^OTZV-)bWKW8HEN@&>eFM{i6LZdat?{hf8 zg9U(76v$=bAp&Sirlq8WV5Gpw57NH(Hx~{TpZf);NCfhG=!L>A`t#?x8z=GP9UL5h zWMg`q9ZFe_R{01C5UwzQ5ny6^1|Is!6D@Uh zxx~iwjErqkZrvYlqd_*|YhRF_j$S9Gxj6a@$BwCIZ)-zQADut6zK|3={q^kj`NcNM z4@LDi|6(zUyU3Hau)jdQhNDnv@O2={LIWDNNGb;=0!FR)bs=R$@1XA^y+;Bo%{?Ix zz$bv}gGm}LXb|G#339|)jejdoH3Nlx~29yJI3fK(ekv-^(Bq;Mz`p?0I2TuYF z1umee_-9xJAOq7%ZD9a&W3m~K5Dl6lK3+CS>hpD!9zee zhk^6s2bfnn>__P6@DI2={+xDlI_Ar6dAto$vL8R*)-xbPESFCFw%@)a%}5|9Y4J3J zHUK!;Lyln#!2kG(6KJ)U=6}R*15FAHMd!g!I9AH5KdHT3aP1~GPzjjWXwl-F6TlV# z6*93U&dMUrHc-@G3_Lj{{%IA6v8q5p;*&`PUj3`vjm^xU5EL1|=!$S@H~@1VF1@f^WtN|14v2RXF7AVZHL@X^Ev4`E4_ zAhK2L92|FtX$Z^TAWaG<4ZK{wW{TfHN)sqQExm2eo=~)3p&uj=_Ct&l=sfd3WeEfy zdBkA?VVUwyxw5kI2PfHKu#wX-cug>6*iTOPA54(&mKL{9gq>s)%88jHaXF(TMq%)p z#5=Pl`TO}>xa)%9I*$Pe#u{a4BM&;bbGy3 zjN*`Mu>0!R{fFlF(fRgVJhx{(o$x;hGWIsO_=@j8pSZ^WuCoRPG#a6FAO1?136C?d zE?`ICAf|wc2Si%cRORu5a?|dTLWTbzUcy*zEsw8^nG{`|#8=&(PA+utV%);gl0St3 zyBEG(G2CLOS3Jt&)ahIu9R-DiINlzdX8GU4#}Xk(UboQ|2SGRk#^a!Y{P_5@@u49l z7lo!>NDPKwU9OGdTn4=aw*OE*<46ngX}Su*{k%8=8wb$TA}4a%PfKWcg^3HkfHibz zbd;Hu^}LJtUnJQ_2;MZ#uCA4W<70P@4of(C#VV)+2%!5tMoM6Sc@lcn2#g?N@RBe~ zw{6=7d<oeqR&GFx1ty?NR)Kba`iH^DnIr>1RNiJF&K)oD z5C5HbvQe{Eh^mQL%MKmFeXqz#OxAsXAYU#tN@)`t)A6)OT0HaQf&I9DAM z4n|UP54k>H3v^yMabjD>CKezpp9r34t^W#-kT1G_V=@(0)%{Ar-Tv70F;7!>E3&xI> zZWWYI;KGY54`&RK%HhYDSXsT${|8-~tH+1+5Q7B?xobs;6}tPCMfGrp$Dblz%JyRa zu#KGDq}U}TI~$M395n~*onWFK-%s{23I}XOM`v9u$^lh|_=K@UJVFoI57K?2Sk%LV zV|cqRl{C)t=E|1@9}M*$N`Qpz*tR$hetU*27=%BV$D`OgfR?~vL-MVDhhY{Ae^TT< zk1Bdj-ke8H_V<7b$Qi*yVD?MdG;!!^^cOXSaIF53qRqWxU=2rvK>iRS7X~mA3WMF0lwM1dF*y1Fj+aR6f~Q_- zs^E9XZgojn$N4<%*yzFP^%e?aSZUGJmUbO|g55`>tLt7NW_$33m@81BVKSpL%O+^* zBi7|I>g!ksVOY=`y>t*HdTmaBCJ@h@xSht+ut%327F+Sa0b$wwsGKE(0k91HU|$Ua6}>w5p{j&-|ydVXJ%Gzc^usDIU<5L2yg1I zO-;vPO$i!~=<-Ik`yj-GvM(+mK&_89^Fli(Adm&oC=~Cp!wJk5f{gIt`zyiqsX~Oq z@kia0lhP19V(9YgWyhlmuy+r3XmC+PE`{_ApG%zY^1Ac?mu%pn4niG3hQgZ*JYipU 
zZQuSV-hPedb6eXAHi5*Oo|}V0gol?G5=V8c%d97FT%eNM=_7^%UIKaw&hDxJ6bM%i zJ9BfSc3aI@%-MN)OF#u;1VM6n0}@AkD|uZGntw$W507`s4(d8OgOih`SdCD5=!E$T zu9pQQyRjjVfH^i2`L;H9Y?ZNG3(~~m6abajErBe-E}Jfj@Ad^cBg{xd z95$(&NQCEc$^><1HF*7O&U1GE2#Kb5GCX9*mmyXRK$k-GTe87K-dP?%eFzBKMMFbL zP;8jc)JBxjre5?envC~Q=K-=l7%HN{0wLD!Rco}h4IaS!eGSVsx%>B(D;zB zh8Ez&6*%fbKueT?Lt<=!ung;LRH?z5jYD_%$-@mROnmN*lfUsEPoFvkI`+8p;JJja z<$HAI_(d$2a#BvOjdC}p9EU3p%d2&|$O}zLUGE(pA^*o-8d5HZgs@1Sa}kd@?ebwO z!5*Us#lpf}aJa#KkNeOe=e0%qi)7;xg#!6)7mnPd^(u?ex%kygw^U7RxvN^$trU4V-yC$W zTZu6&$Q~u8dDucv6(EeyNjQIWryn(b%5TKO$%%s&?zyoW$(G5Ogv-8>UJvi=ztj@# z&8PLsn5Sp`X3ECQRI$9gSxerT1O5S=l>WIe5QWQ~RMd;HVzK(1Haq5!s_%x~tNO3^ zwIOf7I7iCyIb@%|8{f}EQFiBT@}0w*I%J~*=sg88L=1d3kzuE030l2 zGb&g5qd(s23VpR{$6->lUc4B%)y^#~4P8|krff*n6*!^a@0G~cTnLHlmCmaH;>2i{k}&ZQjuArWKJR}iO3k4Q&d7kp$w^z zjAh6?6*8nUt0;4XjF~DTM41U?p6A(FPw(^n{?7U1ocr~8y*|B*XSna{y7t~{t-ZE< zH71Sy1hRlOh17Rd<31;%&A4iwg@q-d2XuVAuFqQHuHb4%f$ZAuu?ZSVVFH0GmD<2j z_-**XKhtFVva;7O6b*)Fq#MatYSp731(>hOQmTqe#7y@0|3D>+CUU!A;r9vy=lH3# zH^UB}b-R8pg|6SSqDC2OL=jNO3Kz>k5cNROH$Wvw~XH1i&^d~7DYaq@7!!d|f zRx4O|3k9oDaUNdWt*n})*tZlG<@m)T;c}`rF~4+l)Y8!zU^|=qX>CWy-rt#wDM5W<`LX3{|`o$J22Y^ zvn-3Zm9uak5b%r#LB8ERqOe-)*QEK;okDRd^Q~8we*IPJlREZM4H+xWPaRZ6k+VUIdjxL3N=Y;GO1f@L9zvgt zvzY|-{=mn2bTq)>2&uby!Nkmxu*&R7t;SeqWvex}u{u>kh_96soomGe8>5I$xb*ighcU ze%Q=RvCHMl(KZ?Ta{GtI6`ojCZ_d5MAOxZzFM2`J3cVN1xKg%ZS8T2IZUr%UI{%Lr zfH*e<)qAnYBeJ`&NJC=XnQyYvNn9fFG|Ovim@0u0P5Y)!a~j28e2aY}iPNlgjGdgu zYA!AR-{mg@8&P|t+@a#ZJ zFX6;6fd_)OgO39La0q9U91lZlx;`_;G{9cns}IY=(V&kobVcQ!wRkk3V{q;qmCD>9 zx>1r28Bk6_=k6`UglB1I2}LLU3#YPP8rXIy=U4( zrcYJ-_FS zv>!Sfoc`1BJMIX(>?6IC_7^YdMcFvBc_!JRaB1B{4Yn-a3?-p~6FuYmxROOr@Rivl zbZGB*UDkC=iEH$zOPi+5%gAv{d2W_8$E)3PR@|63XaPXoU~1>wYL0cRH%}D~&$n8) z>^_jkuxZX+H_F@`i>}S8l<1#gT?M%Kf zpWrR;qoHk`l5d)hboYD5OTAa4!NAd%DrWfA!GbSM0VcdxRNYGM4fZvx zK1!#mb>`UMi92(WO(LbqU%blo%nkwffPxf8CCL*1D3BZ52$Q{K!491D<*PX%dxx_& zZv(1CLJIZ0H`7Jm#{N!{&>dhzcoIX`SkLn3{yG1t`-f6Pq3%d<;_L;{-OWDxf?dCz zdE^cun2N@!wb4m?owJ_o?{AQjBK|!Acy_LriKec zzW01k>Id1?@crhdDo28!sF>y+=OYBw1z)+XaHS~u_Q0~gUIz==c?%__D@jo=7kF=Gh=UOR(-pLZNZ$jJRw1Q&c>W${z3$9SU}Y(#lTPdC2&d-D+GJ8n|>IX6Mc z6ql`&f#=5GCMmA31ccjO-;*rS_o^iVnjz#;r;@_k=>w|->ezHsKrmWiQg)yC1+ioe54ALv(E zxM;fn($>z_R8TtqM)it5{*Q?y`dvV+lEA?i3%nk@>QO*4tNX}>WN@sFzA-j727Y`@ z{+nt0KAmGF*|YaX6mdi<*mkfUI;hF~U}p4L+-^d0ZkFF;?%7?l>6a&XA9qhr_0vnt z-cY0cHqmgHU2sgbC3f=eAYfYh70zsX zmwz@{zV8Sgp}G(w;t(JquTjVtwP|wsuv%(z0)Z>ECSBc$C(Ow6^~(~U6pb0{iz$Jn z_YMSaBj_`IIar7NmOt*2#8bh#OM z88yYVE|22lU#Xa=OtRhLv*EzaXkngYg`YrkCE9#~Kmzd<=0|q<@!>dg=I%|uBvFITGl5&p zAzxP?igM<$6A0weBBFledIp!fSI(YMR1SX2Quj)rwP&cO^o8kw%!P;)>H6;Td6TM* z_2WSy@y7Cvo^<(T{D!FkFx6BfI?-kYpEcTS@5et1ha&YZzJ29-%_LG%LiLUDjRj)u8vF z<&89r8$&}(S^8tI?B3f?Hapw=I6BqoI^tPU6xJVHL=h0=^=)gM=E#fwy9b)b*`hs) zNof+k5KPX=zx2GQd@R~ZzOKY}LAA2aI7p|QYb3fShdKDt?PRg08@Fz)!f=);u-bt( zYx4-1=n}eE03}$A42p@&JYwpl@M~O(HtFb3V|Y3e{jqFAuUAnxJHJ_8?ME-;^t8Ia zL`S6}hl|7U-(zIK$KMkO?kVT3ReluF#qXsW}izWYv)O&jFKLOu1c1KI1!tdRmRG)VY?$SXm+j3JdRnwr0izW0W<%?p5s(XwWeX8^>g-W zwC-it?;`&8#2ItJ&(}+*MS1Dh(p*+8=d|lLw!a|z5np!gxPER%L0^K>5B-^fyuOG* z&&?`+jZ;Op60aOnd>0=5**!au_6t*9%GbvCVi&Y@b;${S{{9xJj{s2w22%EJ{BOF? 
z@BnRn@M5cy+~sOr+N=LcnJr8E`U3TjrJ#`JD4UNp(X~tQ>5IG`0RO$^DX6G!r^+!8 zT{-#kT+)a2?QQJzrQb;9VlL_Gro4Xb?ZCKxT8o#xaI-8{XHc*(s$9Qs`26+1-B(0A z=AH#bMmDtU$*@qn_wWofCHl138z$7i|A%{gVN+s(?v zz-_T9+xEx)B=#wcm}(4+lj|HiM+Vpr&}QDkrM}iU#YXLuF8328$tY-OVnr>Pd|c!; z*X9=%P^E9SUUTbXBj-5`IT{-43*Z`rl`2&bTdm z0zpJKsn5(OCnkmmWTbNOc4s)VD@}9!6^y@3bkMP~Mu&xY8cAWP@hqlj9nKsIwr}!a zS9rCLG?Y8F1ELGG48OUKdcv#^|Q%z#bD(%m(^J{ z;fj+iQk}!s$Jh)#>=VU`TUuKKXQU|Q3e3wk)1aq=Ba_g8g}=(>Upe#hlBE@XDqGfl z_irSbb(=f2z1a71uIp%YOs9szBzKtI8}^UyS(b|gsqel&mKLG8&%(8A>cLy7spXNI zl@CBey>wzdmH$rT^T?jgBuj~5YGr`Ypn*t7S&@tuCIO*zU0bA{;=?9;)v6p!-q_)> zKI=o_`+Tr$^I_=1sOW*4oexs8LeE96sNd|K@2*e3wT)mds}}@@WC$*(KH7)KtDWBE%scR2%*3(+7bkZahmion=j%LnU&CG$c8) z&x1(`;s^bcbu`%C^q$cUIGow@<(`km>08yAl5utY;lE=jqkZO^FUG40j8WawbLPp* zvZ%={Txvb3&XJg^@LDC6<;jZ&LV14O9Wj?wr6q!1QgMsY`*%#~lJ~c}lWappjJXKB z>j@V5Qbe=BwLAb%?7JI`O8%Y#zV zCtn}64qa$FHzS*|dxX}7RZ7FfVLotKOYd?xQ?9(UeM#7cPn*vntGeJrZ}b~v=4%^= zCcjKYtkevdH9uLm88;D7Bt5t=oQ3j{kDuS}X6(NT;}lSGg8S)UuUls;owtRb%dC6g zWcRVZdppzTu=Q)3jx?XgxUT;+(%&qYS1kXl`Kd5_N5I94wJtFy4cksUin6z!n4y(D zsz?8BBl42v?S()5jNUy-&tm7}l2-fvJkp$;bITq7)vJHtn}{?`+)Uo6phoD$5wW`i zGb=mxq^r9=b-v#Q2VK2eO*u|M; zDIntcywE$U*E9}Un+v&Y7xv#dk#*tRXC-=-KTSqJP6szf_plY$=lsWSlkVE?(SZZ} zO=@cA`E3|NxxDO{P~Cj;)L8*a;aZj8Cjx&6v9u&f&lq*?H5Tf#t39?qO}DTWWJb`o zS2{^aaE~mb(w(|>GcEndIbH@jGA@=hf3tX0a@YrrYG+eV`31edII!o2U-ZK<=C=%v z_jM7z)YJm_*qC+_imeF#-jFonZS7gw+S*VDlXWJltlwhY7VPV;`qq+kUC3(~xV8J{a*My7 zSI4UYE$1Yi_UFE7Hy#K&>Fzx3a<3)lu!2>`sG|0cS}jns@#;!R0-Cc{1E1e<=2m@e zd*{>I7%-&Kn>Wj<&K*n>2qYS=KUHg5kMuqSq?qFN80YTZJEy|6wP?xbQLhyQ$Vgr+Y2wwA;DN zkNPj$JfR*K@SnM*5YU&FWOFTb-~4Qy@|qb{^MwQ40k)GO(q!@6=7myn)f-BeixTb~ zj=Rfp_|VVR4XulEcMNJ$1-{kYc&qe{gVDCp{y}98pdyHE9UhvPQw%sU5fC?m?!?ix z?1NinKCCQ%=)M!oXs08+qJ6ow<(FM|P1{4Iq&z7O($7cFNfg9pJve1uW4se=}$PEO;6csl-F z5=_ncJ+6zH+m==Xz-Xtf(q>iqEv*T0e%Jr^( z$_ZAZ9Su7@$+-CV%%M|ZyW6Cyp&=$J>Vp3Ob>qEzFOLLZ7}R0&n1Yev;JqTXV`~3t zPTWF!F05Y3gfH3+`y zUW#duo1pa6bPzMDAcHpqSQGM_BHA{y)#-CzsE@a3J$<8jBRqFCs#E+}$8YAn8wqLc zTt?Fmh05Qir4dmmn5{@SX;TvehygrZuyBEf2z}IOWw&=-zV_?$uTl!>s~eNA|7y2r zGnWQS6#aU^=F;}XHfW{4QAxS=(_A^(Er57P?!mJxQ}ynXN+-1iFJNO+Zhd7Rnca@x zHw{kuR_;4)Tm=q~Y~O{;|SDbJOKm9=iHj$#TY0Sx#tY0ew^ASI<;=H#X?dO3Uzb9~dJ zv$Br8Zg23DxVxX3C6wiZ_mC&W55Q0mon)0n{R<-|P?>=B#?&cq_wV~P#2xymJw+S~ zZLb*%$R4C_pUIPGc`T_@j(fo`BeMx#1t63IAF~3j{}&ZF3+bZZ(W4MB<#YC{Iz~3| z^3jO=w7m1;4rwecuQ)_1XtIYwEwSwSwHDVe@|pq?X|kOc!B*vi2R&f5}V7!8*heo4=}!<3h910%HqA~<^$174+|pn8j#&zir@Wv<+{f#J`d3_uRTW@ z-Aj81L@WMZ`16df3eS%1+d;9ftgae&+8z_+J}prC%t1Vf_!iLDz=OyV>IWDC{27>l zwr_%wmX0iFs2za~@-`zt*fZypm4|=~0=$IfKzUYY=z(1S=-T|uGW+{#q*5H}oM_9q zXlTezN7oo}=8iW-kC;18baL_#>;@sI=q-8RdpYg^vmlH5&DY(<5$&}K-<&_x2dST2 zJZD;g4^iIGa13@7XyXS@M#@rAUoUw4IDI8epA@BPDTqGIDsO;v3B>?psBL%@@a_GrR9?J!#Hxq|RwTGO zEeOAzc7>?Gf$zUD@59d78ysybv|>-RlSakY?fPnb6Imp3N<~g=zRwKH2>&4z_tmF4 z2RFLSX>Bnb1nbL}`+X}%3hbD#%LE%0PcePZ3E{j7XNDM;kQnTHOGU+MKwlnr1Mgox9P=IWG#UU=il~wI{{6d5D<_@pCP2*!52VdP zzZk8~ySBW1za;g$=a8peH`vAcx;iMfO5L`OM$x}8m+i&$j`Incj-zxvNLA@!P>7Qn z$A>lORQdgkv9!Mwfs%p1$76?B0XhZJy^q9*`H9^p%zFpijmC6vo<)s2upMUbwSF`; zWkccs+rCsmii%9xv!ewR#3`^(?)cJqeS{MuI9Jqzzgk+xJM%TfErnjsAKE&0<2~Ju zQL@^C?)=XmHcZ&hOvLAB;5Asp?p}p-12e9ngSftfen-&I6>r9lPrr6)f$&6R8L_o5F3dSM@1?RBN zuA*yL?06gl)G_*XdfFPTFIle~<@0 z%U2yDlY>$M+QSVH`~Xv=A1>2P`4v*~hT*Z8LNti5fXWem%U2r@RNyzfl7Ur&Bi@NFf|<^h_Sz8z*iXCqK!~oFB=JslKtGRKwf;?e8#T~E7nIedV0wJ$Fdv0-Y7pzOL2Vg*h z!ND8wL%}=@(4oICs%&a$X~A*16jEiZ2i$X5@n*RHQV8U9>?JU4o1C1)DZG2o57Ie^ zWbu07&_P0hYFk;JNkUDC&O2zIU?6j>?s}C0@L#+HEIwfoL?pA}Eu#@7L^2h>3=L(# zT;Wmjrv#{#P$)sYhFQe;kuGMKbikbl5FJ)JFC-l>-o>hQywh1aI(kc1Ru;NptRyJU 
zK(@4ZcD_`TTbJdj?JP`E%!7U^|S@28Ic)KI!Qb*jZ3L9H-su zz!$)K@F0ZN*I>p3KUoNViNcp}-zaHl^0KqbB_*1gBbpt!RK%g*0+hfT|9xq^-IMbx zw%zgZu=B~in7j?6H~L`B|gN|kvSmI@>kjlwkBLIvK?oP<)|P{hWJ??%c`ORY!5Wy(7TvHEPdxr zI(+N==#Tp*nnSCDq*Osc0UN}?AO_O3aQ;gmpx|9Q^W+H!f&k?Z;BMobc1q@>#b`%_ z?X1Dot7Ouw|JkYGtiq+Hwkxn&7?$OEdF)`Z@hstYOHM}bta}lDZwTno(X2S60D5e! zIp7Q>dWV6s!NJtlQ^Z($^n1?3PmZxMSZJ{KQ+mScZxl!8b!vwvMC3Dg#PFOsB`!|x zG!LB^v5)891@ODkTm(IBh9P&cryFIcLlbgG{y0{M~TRs#r`mh=-V!S0T`~lw+Hv~VA%}ylxSF8?J z2*rJI)Ia=HP{#7+P4AyL`$dqQfJJM7m6X5#aZ%A~Nmh1F@0Y3gYRc=-o9`l6l1{u%R5ro zLP`DEWJViibE@e)s#GwMyre|>(P{KJjQvbyMmk_jOQf86@sMD^rouxk$f9p>9R)mF zz3gmHbnmt&mwUK|5CB)^f8(1Ka12k=>>MrqjZY(7;xvb~2!GW>$eGT+r_(@yqeLCT zFpS3r=RSONtUO;zMJ_|c6r)ZqtGOdbV0KNZc;V#B?pZt1M~#}ke{mBq6#yZ!1oOG! z0iWvYAbPjKax1ZmozR%RS-So~`2wR{NTlo@&eA|3`H)7DmVbuxsTOFz_?bB((vK~66{aNa4(7}EB~tL`ut{)71h2I=HSIfjTUSZR2logb;FXn?9~Uw@ ztngmkJ+Va`M?^%l6#D7^E~49_|E}gBkuI^j2K^>tA|5Fy?|#R{YppgzSC;*mh>GjXlq*%mrayc55BPIft|}roj6= zkVeZ9PLI|zF!O7-vBbcKY_^$gx4uORg-kAk=m+x7PCw9ruc zo<2X;0uFWY<>e`=F>d+(M;L?Q?97YN!OF^&s!R;WP%Tsu0uqt%qZ!E1(XqhWdpnjj zoz=g+f`mr~(s?`!p>13?Rh?2qH^X>wg(=jY}qsi@F*!`+j8nogF;)xE;xAY=5F zp~Q>gI7lkDj>HXGW~q|x=zddZg`Bduq=e`$n_LnE?fBsy6ZpZNpsholL#zs5b5s5|5Ll+EU$H;8v+q9hy zZ9T26Vwf$B6a>vC*m?zD4ji3;0h7XE_UyY;0|O_JFGXg~?Obz;#7=$N5}6rRa|8Ex zDhr+*u+vNo2&jb|nBwshAEsx^Mh_^=-H^oMLuq%;`R2Ri6W7C0ZXQnPuo0@&c~8j5 zu*aUR6eyU|zCkSEQ_wVu2Z?DM$o;Hws!%+n+2K#ds0hJ9G`yx@bBM4#z(zy%L6s39 zSJMm~kuazdfNq}+y@A=fZQx^OK|Qa;;eNLsshjXd?m9HF_s8n{ z{m3Y%Pwn-33B&_p&&O!HTyHlZ?vhZ{wHuf7=`F+kz+GA8Y zHc-F0BsBLx@os-M#X{@Web%C*?>r>W{oq&?cZ^+~9A=(*oBNJW(A?X3-1*QNb+#xZ z0I#CArHcQ2^j+)I!)%9%^w`uhhI_)9d2VOlzL_jZW=FxXVCE$0U3}($1av&AXt?*5 zhilx~Fr6z?_B34Eew^;Tv6|W1Gqv*AL-UCJD+E173x}awk0lL0Su8u9TQG zE_u|FkI9Am$TF-2?K3%j=W~P8cuPXmeW)Xu zG}q~>ID+ElevI6|E$PLh1gSV?=0Jl7TD`5{x6w8QZNrS2*`o@oiWDT#Q`Nnv;>Ab= z-JEP1dQLbDH0K1Q4j-Xex=SjD0Y)PeI$r8rm5Rn8=8u-lxQwvIp*v2?$Us$_7ZjwP zdPh^Q`PA-ozSPqsa+eJv2yM^5n6?RuMx-j=v7o zMw`v}co&K!-0lH3N;3alx?k8Ur~(pk?4x<#5hJ!Db_p=ZKlKm(r{EIGL2ZZs!SpQ! zXBzR9Jc!UDSeG_Or0)PFXxSvlrhcE%JOMnw>k$%t1aEYJluIttwJ;D9h&_*$ULH5` ziH!V(Ee{bb9Zvubo0L?D@X*iuJUSZ0XCVICi<vXbM9FZX*Xa&543_X4gA;gF3k-q)uG!Paf z6ZsFj5$9T381sB5I<0z{7!|V(fPRcbb=g(}fh3UA&P@Y)tEXUWr7)~3qzCzRiJ=1Wa!7xy!}uI1&*2G7`J2z^ebt8~)#uxTCxU|PY! 
zGI-hSlWldj$BGmshpt=<5H?~Bh*}S5FfmjgtwHm5ef{6H2|OhpI+WdL=0a7{QRXH~ z3|uPV;(P( znn&xVJq3VNKH=^HuX>5sP?Arp=~ zpKL2WeeHY@5s`v@Do`$DnjE<2W%2)?ac`~IXS4B3`JC3{=gfro|5w7c}C zN#?MlHlBfl!S(qhsfxln#)To87T^0Dkee&glP!{uvu=L=esR*L-P{7lS{?hO5?>{H z5xy_=Pq7jRdYi)6OU8OFBS{E;WMp!FJ1qP6nS+%MTk(jnvna3Y&U_&F@!>*)oL0i; zV}|5p6-dvr#zT^S{l}Q^rJ?or!vFz*N9g?eR9A=9rk9NZ6xm%L+xFQ1uP`ZmSCruX zzfm%NMABHSUe`6S&wBdJmZPXTEv35zY1ygM@sOp`P}B$x1UpSG$g7E`EHWt<~`A*&E+YQMW0|{!e?> zwD;~N3CY_ltq>XJGO<{hPax3b2n#I!`PF>Pt-f}6Nm-`3goAVUR!O!)+OBmm!yHrU zd@@0|e@SioBocpHG~NKO`_b6=Ns=jL-*>aJ(2G3K|Enw987#fylBBIacDUAp`kL#t zj=$Hd2DX`m(-8>STkk?1tVvPNT1{*x@QqSkvrg!-_)I69Je`}wSbeW|?G*+iyl!sW z?!dJ0dRulvDrEkOOKD2*=Sq@TJgl;|e*)zqt0}$3+-hn3)Z2 zr)2s~Y%D;)ts#F@`hB7fd=Oj*(8*(;D2xz#T>Q(wys?SSX#7No;yeFaQt9E zA`gQF%8jl#;pcUTK^0EgjQ9`?f%T?k4_DsL2;7$-PT@nm>6r|t!uz3?;bHKv0jHu4 zrJ-?llq?D1y31L5fSqb-n4AR?V5%+KvwAgq8^#(Ep(fmCjF0GT<4nUOy!-_=ZBIhd~Oz%dpBA~7~H z^7F>l5vIWM>mYs6ef4K(Xkh5(1{!sJue7y6|@TAzuw%q-i;> z2ydCi>ZMZdNE9V0mIIC^CM%`qdBYIYKsusPl$>l?K>OZa;3@m}+gdS()cjeQ8yaAv zZy{S;W5k@iN03`&;wHC^?rKK{1)B#F8TL>WI2?7IWn~K7ZpGrq;&|b}BdF0EOmp^c zs&I$m&KyM>0)_wp;9%Y6NoIyzr_1d=5Xb_B!ZCcy6lw4dsA#E+;`|l*;l+=v#?;*0 z&+0l6_O@5i(Q$xc7k!pUYpNxgvVB)%wtFSBqaTTa0GFNeljJtU7z%h87nT(+%=4Vx z9ae)XVg~_#8PBJOS#rtE&(%$VEW=HLa2iOPI`3IkRZIw>BPT=aHjr`L2Bb|$k?;px zFs*>ON6Oo`cBWr28G2M+`Fxl(#88lGYhtmm#T2m2{j0)a<4m4 z-TgK|#It~`!H*X`DJNW!OW!i*1pNsgC_smBiohhI1kHkLKO>0qM1Wd_K7ES5DS)vc zatQ8VTd*NELaf&?1rC`b)Kl0ixb7A|C_$2ok&R<(m+-K`@4OH^?gY1uN2l@0@II(? zK4fRp|8QeTp%kh~MPoL6d2W6~3+bN-o)NW~D}>YL<|!B>sbR)dG1k6_E)vXH14n>? zkjZb}xUSB>g_4GV?oBExDwd0v0FFPfi+}Tm7Y-kHsE~kyf5H#rhJo^@=BNO6MIeN# z02~$-7G?ov3=4g*+Cq?4VO#{db7LhPXtbmDob&ibboBs!e*ewEOsDO^F;>tb#C!!D?nF$Z^kF*eYz`b4KvjU%31sl>`SUP? zISts64)8iMdLytx`-Bz1@Aryx9kppl#qZJXKN%!BWItD2;THS4dE|!hmGNJuz*c0? zaSkvKV4Rw&DhUD80^t&M3)a#90pBUQX``9%-WA>-`2i3Dgf2W7fsy+u^aJXLqdKlT zw{I7A-VyIoRW~H!A-E;z(H`7G=k3HK7lH;S><+t9jbb!3gN^z(4#f`-&PJRS>t0#; z3+CUTrBj{M)xg%Q6Fv3#3EW9|9^oG_RR)*+%qC+CO_UBpaIh0od@>B5CIHGU9kNK1XEK}pl|3Ou8=%h zLo%oKbT*)}FL$mPU3-RR=X>-xP64?JJuqQ+sA6s$j1qBhW2(+E!Y6D-5bzPPu}>T{ zD=_$jW8c1y;HCgjk;&nRb;55|RLB<(jJjuL68SB31lb8?YG&7`63YOJhp`g{l1(qo zZ13IM1ObKviG%=KB#215sAYEu;#kAzj)vzalyR1-@JNV@laY}@N;|-&Gj`q3pK_)) zHSd#CC&#P{eXs>}Iog5njQe1MQh5?S)czpnGN3OhAIHKL~`}~}|-Yz*(jdBk?*R zf$P{YWbLbn4q(CgK*PkHFr#^NT1Zs11aO+65xHBex6?t*<-tACW=?3od%_{xgDFDj zH3U`~84=OP7FvaonSXx~a$rrRqC)3NsEEd!QBWH?{e14k)Y%@F?*y6|7gJ^5%3^;t z)-NSL|67cqIrH$@i2pM$FERvGGC62#xc{`9VORw==kAKrLYZC~%T7Q0qykR`EqPMN zVLUp%x4!}`jedfHoS}ZRpJ+XZj$ycOUBs5q$`~bz2pAcbgmqSIJfPq1GMz}FKHTB<*;>R>{ zgrBtLK(){{D}q~!gS4QEAj-tSAqyi#I3(c!=r84dY9*AZ@OXGA;?t66pJ)SS2%%#S zih1}qk-Obb5xg(q0v20=+om?0zh75q`(T7Q!%T6%No@7zRh!$-IFk4Zfo#JO14{yB z0B8m(N#8<0`GniOrNlLbW^fSS{QUTo5|o1xe7k##Z&Cg)kg8n*5aeS~5lCz&t`2%84x&H6sgx+{1R6Y_l_K`z&Xyji&y$wGDRls}B!^KoqlsTqY}V3hA~ zCMIp6x>V5Sa3?D5MaQvxQj?=|pm}``o@I7^J_mv!YPEtxG^%`FhwBF<%(9K|-rWQN z0NWP?M&Z57ZdFC+3_oZtmE_VKTs%IhTrSCQRziubmYa8m*^6l;0$91 zEDP2OV4ow2#b)1q`hPgZTmPXl+r8PT&by&>C-2`XPMw4487o#drj7xmdMNPNexO)B zaOzs{>$m4tENeKQh72$=Hq{8kUYBZsp$vEE*Dw1qP!)+vW?~{YJqLV`%B}B^mW;TF zoL`8rc=~WpXY|L9D>yIUf07N`iDKj7?@r!KJ+tksZ^rFLql)s#4K(bY^ZnVADYPl7 zoIGb+?vt4E?3oUdATZ2GxAM5;#98130I~s!84&j<1H7f29li_9uZ5j!j;z|S=U(^9 zdBanWR-xj#09x)DJ7egTpb5kCKyX8?VFXjDIZ*p>S_Kn}YtLX~_sPu0cV2gPn|aMk zv)Fw<7aZgu;=wgsTw!k{YE?#@5Qa;Lm7Ne06@`hFc}U`dSwwer#&XeTTlLD|Btylt zSvpA|peeX~%665;J|lDq-|e{M&iPYfr_;u!5T(Y-c1U5(3B3qY=_B*Gaipx?vF(tVYG!f7hn|*8*M4US{ zH5Kw0P6=P9pLdL2>hS#n_43jTv@fnAnNfIdK=cIas<3Adk9{N{ZqUim(OoO~m?ON&^?2xAst_-51`O-P4D45U1h#Lsx^mNO8E z?`1c!lyM4MmQmNG-hC2`=gm5~AFAh9xw+vFqE56(V|?)#IMB9V`W{Z(?SwN66*t;J 
zfWpBKdNZBnGdaCe?7WKcPiDW}UgPixb(xyFlA7Vdyhx#ASAO&d9B|3yYQ`pm?|21S z_gcu5b9B73=;GGOxOs!|!3*&t8)Bt1?V4=!yy`G;6+C>nVv+Syluo*b z)6a|fDYE<7*re}viT4Fl5>5ce7ulgkY!U5QZNLBFt!tQ4`lh%|HRENYsRK`KN(>wL zrT(@p_XZ2wuN1$Cv0v0T{-*y>O5ZC$``~T6+$*j+t`=WApFR!Bdc33Iq(qyu@3Z)r z;d5Wnfr3&G8YjqD1}~=~$0NI~6Nzc^P>$p`V~|*Y3Qtdg&9+V(N$iNzzeyW8d}(Kze`-0G_+2vZWY*Vo)u$X-?$DYPx1W$dZ)vI-mNc(N@qg6W zKO1z+Yf91dTpsY{C)>OaC1x?gli>dQI*`bx_kVi93rP%8{znV&UtjnOj=|54AAg1)8E`yITc%|(G3TDRM@*_kWwocN zjP&k7Exndq2He^C4(dq_r^Xf*%WrrD=&LBAv1$NiBP1*_x4VpbDvNK?)}_Ajl5NHS=chtESzqd%?e!7>XuU0b9RmjJ=H6}jr-jTk?T^@4t;FG_Y=b+Q!XoNv!1{f z8jCnVo9b5FsFsqp)TA+c>PB;dCkq>y;<(t2EYcLeKsBS>=Kbea7bo3%f-0X@e6po+{yFQ#4^KauHqAujsIT|>HR2?rfWCnHZ5##E`fF-8QxdmeT2 z&)$tlUs4h|l&Z1!7Jqh6WJKIeb3Tr4zfJlV1toI z$D*Uhn0MC=6|gXna1Phcz7N}enVuwJrHq}nwbbopHSc0e(=FWx{C`<>tKZVxe?vML zSYm0laFuQLU3@X8QQ*Nn1cG?4JabU40e$UrJ6`^>cfvy>+5C2o0@J?S4cMAli~a|J zw>T)5`DwhqtY{%8&|=am*>2@`XIc8&x}$O7!uXj?Dwj#ub;IXz4_!}AeC7Be^IlI$ z#T?M24ZW$=|;}0s$)e4d$LYuNHrFvH#~W*-IXm^mMC~=$*E&{L1iCF%erY za`bv4_54TWy|tVGZk38#bWt2q_1W*84T z9n5yKgobK;bXfGtgfc1DyRgCY7nE5tE9EQ}L>M`Fgg-{`9C%K(ze~hasefXpH=kY0 zjhzubi+(RY#PIva+KY_~{4b(kf2~_}X@9&%nuf-J;T54UcBP1fmOh$ayMF7?Cf(VQ z=lp}*UtyKrcr1bYeZVdrvh>t+%comi%;dY&0=)8^E2%-0hpOn-=Llt9H?*@yF~B) zKB`XpDH+$L7cT0<2dg7kb0;Hxe`LzgCvL{gP<^WX{?(L(&^Y^yWXvEv=tIQo8U<1h zP6bcV8LHgUQg#?w>RDKb?hL)4ee0?)V`!MwhOtMr@dkPJV@-aS)v?`8vsNqTv;%V} zOZV82ORp|omg^X$76X7;ZG7SFopANP^mPJJI#=5U_h<{nnvL9dUVGVdUZ%f0@)7ktM9er-$yTVIlL9bgx!^!;;hbC5;1jO5% zzI@RnNY1rU6>Z;nDYqo^%^#f;r}zTi=d7pm5!Q#}I+A>)NYVu|&5PYK`KRY}08i`#Acjy1n??;k#XhVRaz&;H5XMw9u)8UL09WoL0k zK-O3TEDXaVBaJ1^*ABBA7N?F6-fSOt4QR1GMr*>b*7-QF%TD~g(&tqLd)K>iWBZ+7 zBv8dkTnN;B$@r{eIsTmeAeTV9;aT=yB>y+B|1uA}{v+WCY!JS%reU>5>zxJJhsgzx z3!^E6Gq-=GIX_)0^Ecz4J=WyXJN8;KbWlY|IQKo=Z9Ey1s=>nUCuF$Y_m!=4gQ3*< z?3kYEigVk5f)SVT!nQgA^|@Uqq|);yR#$saO#*+L6)}Hi9YFeKesA{FB>}UjEMc0U zaCy3Fi)UxW`2v0*A+|VygDM3>rOi99P zBOhNgsp(z%?qZa{SG)RKOpS?aM1mt=_3C-E6NgKsPL2NZ^`H}Zrm(I%Q@`wJT+9+` zEIwx5)p~13iPT)v=Dy1NEq;Q(!jksLHUFhJJSpZn`PD*N>-^v5b7%g(S}lxIJDid( z;AD66`ST+`%~w1J&H6;9s1IRA0vQ40QrW_Kt{avY_gC;_ibpTfgfJ|B6z#Gzn6PVY zl7fxk4gv=63|L&<{MT?l7f5|Gymv^cy|T8=s(rYeaG_Pwr(C7|_7BGsR0AU{m7ljS zrkptQ;thY1$jErdNF}xN*AP`kr+Sh+L&-^(yUP_{YJ+=Z8-A%kiu3BL#e9BD+1C0h zeV6su$|ctu348&+TtV=?@P0z-t#Y9;JF0YHF~^OqpjDadgJPeo!*8vwY0U#8>5f-B zh3?I1y7h-AM;nMXr9I=mtajeyQ(LvK38lU|nc#!UFVGN2$E59l5v z;*X;CJC5}5Hgi~Iu?H&Zs86nO6>yX5usQ&%^6~M9XiU8A>r+(?d=3d7^d_En@{uv0 zBKf9n+HT~)Hmir-0Z;dA=c*LRETQg>?-QXZNLSkc$ z1H`@<4xv!@fGM`^89ezt@Z|D#B{x3H$g6l48k`*k zpLsPL_WjpCRpgrQQQ3$2+HhO4FG)RWOf-5Rd1uSHL`9?Ra^+Xf*`6|f2D-N-ZlATp z?p&Rlbw8tVrj?T8xZAC|uOC0uY1vi{9ukae%i9}%Y32T$kQ4uz{N4V&w{8XeIC|`O z#A_N|_C?EXN6C`DjI=cM3m0sV;Y*cR{s(@IWj!DMv!`~IY4C)6#5-yG?vKL`+b@gk zNEhBRsC!)>MwLQNXc>`vtbHD7i$v_~yUMkks7gm!-65EK?k4wIWR9bh%W;YfoFTdw z(A`|x|5%2~L*}pQ#Rkh;rof_rl0y|oI(kiH_T6FolIJ29?)1Oas1K$z`Fr(6f~vVl zLq%}~nTFHP$Ad7FxPy7((b=ht_?rT6YlC4X^LLvx{EWiB8w zLhdJe=-{d}+)7on2iTP4pP(ng*GfTDT6?=R4wfO<9+?0@e7xW^&8WwE4 z51T0LvMjs$rz1nqG9WxgL61jJB~_4dV$Cq8xJ=fE=k1%G=MJXMrEXmfq9wN9cblEJ z8j>Dz@>BR(O{-UE{&;8I3(>O>ZkER8>`b!e&mRij6e1Mkiq z`|jEITn}2`@72{e0jz@w68C9h&-uVByHcUHRhh}>S-fdzwj@er$K$_2_9Yk2=CXTN(~ZM zq}JWO90>m2$n93*9&=cuzlWZByYGa8|HTaE+LtEJQUC#2_)++;s~;=dcQ2C#3KruJ zQ!Y1O1^ZMsX;AkWUi0-m{mat$ZDwX1s!I%G01Wf7k*#!K)b{u`E|qUJ_FH`(Hb0fG zvg?$5o>cq9I3xTy&Hu)PwDqNGY&Kji7~FjQ@+O~0{K3{^GNN&Puclo*Np&+R z{xLIfQr(0M82)xw#~8o-Uu3;^JeGa@K75%ekrhgIq3jlAi&Dy}Y@w10N%o$R5tSmz z2qhuP%$5-;qO8p9y=C(pSNHS%J-_ekIsfSHu3Xo7o}cv|$FMo3ao;b#dH3b6`cq=> zSN&}JcT5fptHxzY5aFu0uCc-44BS?vP%w_MzH?`4^Y$G&PMI6z6jt9u0!44|Sn4<{ 
z()n>o+^W9b9^n_P(|x4zBOAB+Y3IVwxtUys6i?QB=~QOShQ%}YCPIv*_`2!JTrQy9 zh(-(W3t#}LX0Wwqnx49GgRgM=5W4!)?n?-h75|!01g*&g8NS3s;F%mfFr&Z#frTrQ5EcGFB?H?2`ZCWT!#w1FnH`Z*>FD{40;fjy;Qv1RDrC+@O~H zEk0JlrX!Fz(|uBq;@xk*W91({9at33elwT1eYjQXEwyU&`;wl&))f6g7oLX*P7Ny$ zq&qV!Hxj|kQJs-L_sU4?SL>>2v2AY5|NNIXV)Br>z^WNDKaKv%a)<2n zs=@rzz=2nuNk2XE_3w@yyR5AZ_^FnW(FU*y&yUs^_m2tcpPh?(DC=;ld9c#2<}<_B zf*}!AS|#^okzkS#!Q&>!-Y^qwO$L<-wcS`F5 z%J{EN1wdbX0HoVyKE8gA>dQS+yF+ZMmga@D!74&;7YN7SKfD|_zBx$h44yR^J7#v{ z#=Df1J*=#DMxKF2x({7hb=*fUnN;uomem7w>VpCT@I_fC%3FMyyI|YB*waNO^6Z+_ z5B;gMn1a5me;e+Iv`u73w@417&ehb^1pG*Ivz(sZgz(NUG9jtm4u>TB-|^2|>guxc zL1yYvdg_O-avi78_-<)!)zQ)l3JV+7PmX7Ketrlau$|l|BSW2vmRrT)D z-XHm6XPoI>VG~;a{rki7d|Oa{wsTop2_q~$i-=HraWwEeA5i0r)4DD@s&`NKcjotP zN>>Z6nR}i#G+YO8l}I)Ia4o>s_mrtAcQ?KFbZ4O>&F7YSB`bYA$ zhBSDvQ%)G@yj+vp_8aUAu!+#IK{M5_yf?>kYC{ukHq^Xvw|RMvz{qs}Szrhe zyi&2&|BhwFeB}m|=d(Y2n1#wNV2vL$Gs_Ca&U&&iGrx(CFFP3yETTK6l;ccQ{%5%c ziNY20c97*hg5!)JiZA>GIKW^!A3u83ZtJ*L3cAPWJOR^!8W$Q8yLRnDvrLdxr$~OL z`m(F!aNHyX1lJKmPIu&i#SYPv=k`KAi87=SsB@xJy=U)UM1|-m4?rT~$&)8e+wSjS z)p4Squvw%qa4hZY>FKzSkn%#DY>di&ka*XDtL+@k_C&Oc-emwZBS2O`%g(^SQ~H#s zq=}&oO+%<4UskG2)js1m8f@h|c8t9{0Un(Y3OCTwMynOj99*3f1#&lU-8unsBdA%= zpX-{Ka1D=nJ6fTeYIx;zdS!dc9Bg%=Gm3gqD}(Gq6e@d+$Lx)czhK<~7yTlnQj<8^S=zCcQq> z%Nayl5F}7|S5{J5-`Ienc!;Bt*n4u4>5x?9{qN`Z+sSmZ?xZXX_(^^nU%a{gDK0J! zWf=ssadF0KYP2P@Gmk;W#v4Bz1?{J~FPxq;^~IDSf+f`!{Xx_7H8Y=wyWT9-c(z0Y zld$N}Q#&qneBs)L2p`56z-3^_VJwQy3Jh=q)ROzz+123yAO|!w3funv)yAG1W_Y{c zF<@Gt*~7c_K$@!ha^A_1RCG_1l9RtwRzjt8vB&2ApO@e3Fa)5~n%udg#zlt>0!I{R zk#)RsMxW9^QpIeBq*;Q704z@c`az>SZIN&9Lzs;K{{sH2^O)$iKT&J#v?hi*qgM@Q z_m&t-^t!`830*`ev?$Rj85+8wkBUIKKjl)=%Q6!&u#G|3f&BlM)tHL<`YDk9b8=!X zIRjz+^1*`|08-JM`owuV*~uZ+imd-#hfS4Kew=57*GJot10cvKJi=q#?X7;%ff5tZ zng^Bk{QP{p8E86SNr0`4om{gqboZ&1Ur0_86A~~@m(W20u7iY(vaO>dtKV3KXgLUK-&3p~!I22J2($EsSxGF1Gp#Z$uewn>wM{?scSvvQ%NGjJrw<++ z#ioQ|1&DwbHircTyrm!?GBPlXmF(`;00|gRe_7nrR@?t z8j_RE@7Wj^MqqrexIILLth$%4#36y2Q3-Iu8j!;rAbH*pu5iw#0DM#Eeofy znMFJnY`DY07()XCi#IwNhEhD;-mfSN-cVo{t`@-COZ zkum(G00{l|jmYSOaau*h8b7Pfx`>bv`h4N(QH(?Fv4Cc${aJ=YRg110J`KEDT~BYg zLtx1u1;T~jV7}s3@EaJR0nnwQ!bQa+EPR2B4qx}%!!#`;Bf0S=xc_*0=lQbUyn*}* z`iYpnxJ_KSTc%c%H|!yA*0o8Nf*w&q!so{64dCp6)3Q6c0l*=KVBchF>1acLkoq6F zhTQCI4Pw`{weF9}Yb;nC&S2AArD;d25eFMS69a$?H-=SbTB- zTo7=mhT2+$k{7kLtEB`81$kGib|mduy9fjdG$UNCM~&18nNHv;;%=Fl|6f7<>AQqp zWzuL*=DS26nK&8Ob`Qvd{l67!tq z6DEQrO$!a-xNdtnwhs&zx}`dXhB#3O3mma)=KcNsdIxWkj64N77>|mL?ZWxDGl}1_1=h{&@c^5b zT6qCUS!`0$03aTDD6!mZOn~$OqN_f3dO{Cq*tnKfCxHN+m>T!l$nxUVt;<2*2NWxy z@qk?4vA>;6I-zw>i5~Hg%cH$3M=g01J2Vmk08{XCAOle@wU034JuBVO)b#oK;SW}f zGsl3l!RP`)hYV@@cMdgPTp7uKpcZ^)t`(^VKR34xz`(!@Ll(dTh=$@CO&Wr+uRU2A zP1@k^e`o@mimzYCfX;O~Ltdu-;j-W>ey%N{-dL~)fkwpkg8~k8P!`r7KEjAF4#ZKc zL*j@3;K5of;qLB|;kq3B#mq5X2|_0_Y@!(;5uOZZ7nGke4XN`()>h0-V<{v}gj)fx zna8w;nFd5f%A|OHgr(_z@OkEw>qz`ceqOH#}=kV@0+Jt`nB zZUfD5{1d;f9C;0eO+Bw8NYQcpV(nYEWpc~N5P1{WQ=GO@5F$*v2)$uBhUll_vbuF^ z1wyJtyrdiKxPPBo_y&PNa-I*810ZGRZ86|aJIPnfy}2&x=pgfr1IImJP`^fwdU!6RD=Pp!Rw54l{Fx}FpsGrZzwPzH z-@2JvS;f3~aY#tW^7ie$0B$-;Viw}-im>n@0#*N?6gI-%&(gsRgiewba}xDLtrTpt zH-uZ^lGEDKLYzgGzKeQg+Yk%EsR)8jXl(DMrx%BlUlDKLcA!DQR!R6dnkEO$9ilnV z@87vb@yW@K12yz+KKx68b}SX)J+k&VIspXFY)xllYn8u6>*k!hbIg? 
zmVrqJGOVK%5(yoDwfA}WzH>r?u&$)5yNo&qfSz4>)^S-`w!|o${*NVY!fKpy5~z5j zLZAPHs$ozG1t8iG;{pISH4{$*K-kMp8=IYzgPkz2GL#cAb!m$hAw$JJcie{;#|PlE zE?ju__%U1&E?vLAzxz+`ElhK{`YgMX?94Py+lccOaMnev65JEclMUxjP_as_SW?^1 z0Mk55+Is2WxyyZduCfFY zb#JVqB@7AT0RYT_X9ScF<10BcJ3BfPggsm2G1X3d?rO}P`34wyAvyV4R|lIQ<6@U* z7T&W|z{}&Ly;5I`Ng*jEQF*O}mGDL_=8%X;sG}q(7_b!ahG7lv7zt-o zbTsu{(oa~F*d-D2;=(Ztj4Tfo3ccY=0HR(_POc(EnwDk+WAyVa_7X&ZsC*C<^c&#U z>{~yHP%5jA<0T#KT}@4Mn>gQz7Y^$G&T4dQ5XPttYj}EY(PrSsa2jJ46SrRM&bXkV zhYLsc?c21hVuTE{K*&pQ5zZ;wP#`SY5L7MZSh z=>@Ld8}%=5;k^UTh%JDgzFcat77-xM#_%Uk5O`nI(t;0d)%C+L0m14v zFJ`Q+58#9ciHjD6O# z;^nRm#!W6Vrh~wdoRox+Ku=c}kh-}qJ$blU*m~kS*7n_!fR{U>IH#%8e?PV zQQ}C1x(h;AWOLnL&O$$^92qnQ3KwkHI)ahcV9Mf!A=UwH08YUO6d~pTj4GhrkXhKc zUSB1>CFFB;hr**DKQt^m$O+*#%889gQ0U2FQR zu?l=KoFLj2Vf67vbclLGG7;BP975wcwis0L%Gr4pR(n%Vn$VUbv>%>%F1Sq3Ceh%2tF`z)1H;&8j+C7 z!+j570oWYK3L#zvY0A709~xU)a6sD{k1i#8BPZnzgz$Vp{C}py*-f_jAI|% zD?7WRk(NSgFY$8tSU)*yOM4L2j2U(H4`1Zx@uRiFr*MEY>z zM0-zJLI#3f+tq8uT zxw(JQWW*+$x#N5pMx0m|!y-h2K6pYw;;BG9GXz8x7zer1C(#86NM;)ez)aKNcErw$NaP zSXj8is10itSuAQ4NM9f&v$-Pt@U-jtvd!JQcRPp`5X^u*a0N!;ka)yAdxjz~#i)*p zqPDgJ$|ZOis4~Fv@A%=v=!h_vaNEdj33WF3Y`#a`4EqrdRUFGoJRfeTlX`O@Fg#g@ zQd$BMuY(71HmNyCy!HKyDcIT7MNbY)?(|hN|1XPu(yZ(9lp8T1T_h%6b8>P*G>14q zw_|!cxgrQc;4;A1cnl;DYwIn13$V?(*4O2wP$>hpKlI^4VL?HnbR$UKt7q?9PAu6k zrWTx+e^gUjD{PoAhC+>;oF@!ZkeGsH4UY1|urN8BK0hE~yGeFXAhz(drB~n!wj-oa}WV&G4L`CVL$c})DW0Cbk1&wI=wzvr9xXrzLho`TTvsh9y3pqhu3=u(8R69_T zv$WgLSo*IW(Z=QKT!Mf_Y=X^;13Has$TubRKOL8lXsxJ7%+FuO>_FH&V0fnY3??!_ zrbPJ;9|~&3EvRkt@bUROYUpCY5FR1$12p&CrAtS_qv6vt1b+=Qb7T%1_zWayoN^um zhQ^qN?zMvOUF)9T$tomEDuH+Ll??88t`pY2Ykzc^j23>8`2Pk zx}ffdtI8QY2))|7y0^^>jg*eqts*JFIL^R&ZJ&%w_Rk~vUUdLxBVWeU#Gc5@!viHH zHezIS5!`faf!KI3q>zC)*!{=L8kKJt)`*+e?}NGRe>ozh*n7clbu~wVhmL^(#`!5k za*zhxxr68?o|hreZh6KOPho7_g8~7}_IlYh6 zUDLX&@q;afClVeldWZE4m%cV%Kc?s~_UWU;E!$|0^*3rfDr(fJf9^eC?Es%2XQ15( z9bdYQCkOJuAI5MeL=6y&0=j3A;VY^r=cv7VO}}bMbo;HFDNlYp37T_ZdR&=JN9_oY zN~m?3!jT9W103L|u35OBm4!YUq=oYGBn%o7OkycElo9lxl8QR1nWbfjqmZFb%NsdY z)#MTVQ0;;Kvk_(}Ct(zD-Xqx?=})YK*w5d=NoX!~rAe%RlBOj@g9{Dwz*tP*w)XZv z?5f6MJ(D?f%&~3-KJ9z+F8L~9hg1TmgY!=MbKz#ri|^O}d|G%hGEnNPvy0^c+dC*p zI0CYdKn(}VXDQ~t@|${G%a8+|1=kZ=1~|^Q;jKM*TYUJ#;M$_zjV|TLX*GWFk9iN1 zw9GvA47z7GnNAP>++yPn)yZ!II}rN{wlYk@^_=@mh?7ZlG&TRgx6st|10vj_Ykz2; zZ2D(wyIia)JmYHBZTWGZ^o_|4t+cqfeq}>Lq;Ap@5{`IrPG3G#B4sA6Z)gZBH}UiH zTgV6x+W#)?&HaWR`Y%O(v70iki;K4vqHaBgV>vFYJ^b*?Q%3Q`m2j%KUd|PACLiPD>gpDlevNuQkX# z#>T{apDOGw^(;lr0s|GwCK1Z0`6uAvce4H5JLnY*q7BH0JR;0+XkwXQ3xZh-2a+DH zOw{NR2*dI)$dOBz_d!q~!;h90Z6<}022TkFwl!-K##b_WGP+iqldIDr>A|IuZ4Wda zUU*0@!~#QfT^u`hcEk=?ySpgTGXfhJ28eR84|~E563u3d0d8LZAhB*)XT!EjF%FAU zt`}6Pck=*Yowe-d)$pE8RJJwu@Cc3aD1EfdP>c}W5u;-FDN&y#O$cx;K*JFS1dgow z7B+6=J*)_=bisW^V26q~6h!vPYs~%rc5&lpeQK3(l~yn6GGu#Wln2{iK%>V94t`( zwP3b&d*2Ou@Mry}`mp0$J(Z`w_Q+Y+Ejt^`1a9AjSPuK^5;k@i(t_kTLpz+DDVGc} zRDY{zLME{}fzmZ-CvY{dlwvNkK~9fyVBftKB?lCk=#Jv2Q5Xk}_e8fh;SQLl2s@yk zSeYBknv1>^gwFn}Z1uj09gahVTiT|t5>}#0%E~9$clXkxXK+|Tq9@0KMVlr;U<}sc zmo2CY$FoqqqZ(MDG$e92Ij-zsVuG~nQO`}c&}uTL@1S7g*oHWv^ASfWltRiI5w&3V z^z`(!u_19Uq4oIO-ma&kvjj-B%qLwVqhis6H&F;NHD(zpMJ)uO2UO1e9JzYL?m0Z) z!I%=^+OsMB&}cj$+Vi}@UTyEm!1R{FaU<=3q2qtEFJ`Z3>2I7wTt^pn{Jatm{epf`#;F*nX=ev8kfC3V}E1NO76nD?x1Ct`h}hsAATAdR`DifZkT@z zS2KvU+RLY;W$cuZD?Imf(f|6Bh%MiOJNo?xF>#>>`*P~Q(W6hhq8N!=j14*=VEaK8 zq>sHuf@W8Pe|Yh(bt%=z^PY~QmTngVGJal}*d=Xrwb=6heO7bQcvca8nP(gyJkE6H zy_Xuz9-OW1Jzrgx^3$p#sBgzExEJmBpgQ4^_olAJDHs$X1OmivtG6BbIHkNiL_W_) z9ljs%aZmhZ$NKK4^Dl0Ry9|YxG7SyS9MR0_G5WE(eCh7-^6MXccMQw=YxTbgpgEAWGn=GiL*liOLHWk z;Jcgw7?;nVw=JD}%c`iF5%VeIn5{o|)X~Zg^K3DFhmCI!=N#gUU2ZH?C|)Z6@z$!^ 
zBJd^m+5N<9>cP+21vUfeGZwYMnh%sn?XwpJ4*d}{jgEKY^WjptxNy}zsA*z6O)o4W z(=hy7fnw#PRz>9TDjNS=;qR^{Xz6=49rVdAabJv8c%;cb(`9t*tn%f+8X?d5Hy@3@ zvFrCAyL1Q+vfPj>4V%Qd@q^Po8g@O2XQ+IjqZF%3VjmXeIcV8;ms8z}!H3Q>>bON3 zs}`F*U(rlQuCM>gw56R*gk#Tg{kT5Yv)qodXL*@IC2ZgNwT7flB$?&%?gN{$FAl+W zS>tY-+Eal+msdq690E_$D?IXl{-~y5VJ@0`=^f}~0@`ccI;U4}O77eEMK`r*U%&s| z&P&{f>$0-WJ&$n=il;f2WA~KnkjQG6ucXi0d{z?E7VXyTj)RS_j&WSKw&vvIY?$&O zsNsm`B9?0qQy~wWr5zBgj#eE$T>b4Hr-~AsI|K@sB~^GI+L((Dy2r#nAFMjC^C=hi z!Nzu3vbp7)ZSLQj_X{Y_CFV(|gkCx7^?w76Xm0#(pi$QU1vL8df{XjWSspRf;B(Kr z9DXbCmr8r~>=pg^;lnMkO8-viv9#jN5?pvu2}M_UTA<`rY0=$3i5o&^?u-81qj2Y| zCBL%b+J@0h+lxnn=xUhG@n7U<|M5#VsltL>Ie8E5lZUVh^SH`GrSHtY-(h(yeCr6y z`%j*uH}(W^-K<>z_Ac&g+o}{RsOhs&J<%r1qomc9x7H^8_;q#n_NJ2ll}~RtL1U#0K=~zY68M z{26T9Q~K4;i8n@VtEpZ6xY`2FCWF+#cF6&uHLmNA7d`)kv z0%{TKSzc=hG!ZYODT47!^*>{tK^Ivm;aM8+@r3E8L1Ol8zbR+;xb~*CZ`$L5;D$SWnyNs_s~eYc=cMb5#?dy zmp#RUhVNtYe1AnYlc%&)&U;SgYxE6R{S;T-lQnXY9`7MJ~TF5TUVdN)qfvo$<@t9_*NSmQ*Nxvbg?akZUDtGFa; zU7rXEk1pIltz$jU89_6i)zl-NWHtF+H}7?feN?6*F|6Fe@Iz3t7&px z>)tshmF@h}Nhf{w4fiR3)E>|;Sa?luIp6Tn(M1R$D2l7;d4wO1|5Xb%+jWM&M?+`1 z;k%QE|Kq%dvGJYzuYX!HDkOWiAmJHyN9SHd635KjoMTpr+{(W;+mc0Fb7Zf)>hfrQ zIA-%{B|!FO%J^b!l7b+or&P6pr$y}H$I=U7cE5M&=orVmlHaV_QaIHi$S$=Jr1#9> z-spfS+3UBT`_8%ayYH&p{j<>eAq$$*TvYdek-WqRn>702JVEQ-!5DeZ?5gSqkGrZnX^rPnl=^r!{? zVOO@MyUssdI5+GSY|H%8y`tOy8%MlnL5ba}$Iz1>{oU^-<%WyL7tGUV`GA=~Njc7a zH%_^-TvV<$)3tbI?=7Z(S+l#dugKqivwh@LvQmqTTjuvS{iid!?S5ajkXBJrO8PLH zTzl^m5C{?+?1UfNkQegsusbf!)0uFO7_P_10vV5e&SR;hET+(s3#s4 zZnXo=jL!1%$0Q`O)?D||S(3^&C}JYJql7^OZcy}cL#H6%@b%BDLUaWD*u>D{NTT!H ze#LD{PaZu-v016qeHZnun4ksOR>r zWTPOcH4@MIcVM9`x`B~Px9@Na~G)R?J}R`uK( zk|E>%dfTGTedD@KB9-nmmF1|IHv4H*>*9@cKz@;H`CW!e{`NI zT=C4vT3g-eAH!qfaE4lwT;aLd*)8Gd+wTsF@|}OCvG3?_tHr3{<{oX?Td!}u>KHl^ z^6T>Bq@bF&H1@izOgiiCUo)y-X6C%-ZGN4i$aC*^MAlZIps8K%y+dzbSLph_66G%u z+lH_2IU*(o{0$29LmsU}_%0M^z2zAqT3BVqug}hGv4=^YJe4}RzDr8E3_7IG<0^*r z+ExROB*&Aj>Wo-Ap?13MtYJZEj-yY3GB7*8B`WLfUfvU-(?O&^CkSEqTrkR*rqTfJR0ya4Yg?Cj7rzFmMp5}tsc#o4>Q7xyKS6IKb3yCk5OoW? zpzum}#kr>^Z~NrzpG&6@Wb<`6v8O|s>(T29WZjKReS7~;bp(CZFPW|cm#e6n81=l; z)z_b^u3`FLm|hkVi-UscrU?%pA4%$n@#4BAqJC3(gtOZ%9$heP87an*6Q_+YpY~Ja zVW1*^(h}sSot=`hU73#jmWXJ^H3lWKqz-G20}oW~Ji-roy{E8Ftlxl$E4KC9*8G_6Q(+Wz^j73B;KYR!b3o{pM6CkM&pESR8JlJcy;GwgO+qI5l z9NWkw3rCAjlG|_g@1u@3=dGj^O2c!4X^G;ROlCzra;M*>>%1fShg+q+yHh&-){1$G z?k!gYZ7@FZaFqK?bMb8mwCT{LMZ0!J{@KsfB){RKeo3-d^ZOdcZ8o0%c$*S_`2kJs zY=gn?;;&AH)*)^3rLGUDnx4+hoE84V5@|+VeX~|05KC7eJ*SIZpvtn(Nl($XX(Ru4 zZtIV+M3hx&_AMQD|ePuDbWz2Ec-_^+rJoa;SMe-4} zXm!6xye6^2f!&*Vtc5^A^Z^G&d}`qs0J>~UFq`B{zH@#2)7VWzQ;R~2b`jEQ0d?6= zP4-VCiUjTp?SGYKE@d(P-O-AA`@-kvrNeEsnhc4L{3@SOcGPdI*~Oo*xSbgCdod_9 zLE!fzw%isozZXe#XH~SU{Knr{sMD&Mba~S6X71e3*Evs64UmDAi^3zc+wWyZUU^wq z9_2A9=EVy|>Q7_`@P*elUYTjICT@0DKtD=+ z^~we!DMe(X4-%EjT+VQ*28c&}L+{4f)%BVo!#?UvS!#0f$0{#TtwFQ)W_JBtXR^O;o6hM)H;cG+9#&-1eEIT&pnlD=-b?bohXhb=g)k3OSo`Frxo!l#mwfhuP? zUh#L`eNwNjtS1wWO+A;ej(zjy1~jRbA~6koH&UKATtjVNQBe_^o#nX~2yN_37vG%U zbKH`ZKkr^^H3cmr_rdeBe+yn$=!~okcRmfA9MXHct6CEv{OgBTP*aler0!|RBHSe{ z?3X=qBwUFHJRw_w*p^{WR%&{`Zw^E2cSUw6YwC|ty3gi1=PtJMCDzmG*=U;2Ped{r zz912QXq92daC`ogr#-`2-d@c@PX2{RgOGztdKq=iM!D4oP~_Yv>n3R+cM&###OOmF z{VXCqq1eSM(5 z)gEF!g>&rq)L`JGj{QnORvVba(Jzpykr12Yu~DXCT3=|{>sohudPjCu^#r=)O!v<4wNfoQI>`ZClIk-rdiBG@Alug z&{7xHb@?p5rH#w4vEYBO0B9)#X?S`kD5xeO1)?qPiVqKhQ7w4n;!kbp5K6 zrC!^EDX2qI7Fg&XTT4aV9371g&=go5zj#xFGlUWSBnT`8!*U0rKT{b)G&r`YO3);b zl#+s~{?PF7wI;6io9(Zfc@L(z>TDafmo!#&b-Z2iN+{~!`1%jbSMJk zwz4uW-!7BT^sZdMx2Xwy5H&(120*5O*<7rH9Z!@-#e;JDSmG|St*B-u? 
z#?$d4LRJN<9vvyah5|Nk@8=#T<1FboW9(QS6{HbUe|f9zcBPvU{bTMUo9@eR5~8~G z=l{&6+~-ZY{8DYOQIqNQ@Kg>H@1y3JB^fuJ=Fd0N>w`C1EgsQ%znx%N9Ne!kp2}N# z=#gJ`@&?=1hxa+J$E+55?!39ZBZ;&AV0<(8dj=)F=~QEN7kiFFSwpjwj5en1)7NU1 zX&#!5varxFCerG;6=qj(cZ8{&D%Z3={F zzG4r0bcS+kR_FYUWlA0E%WT@3U1mf0MvQFxPv5lTT& zJz@ud2*4yh8jLV5M;nFK{)goz3(^v-)Dz>R8^Nz%{{y!5luY*bpGtO|vwOf8FRPn@ zrw1+523=SbCBbW**rbsu+gQ&UxZ+4jh=bEjiA}#hd|uJ|hUVXc2P#X`qeuSBQtrJ< z^(dA!th!3g`&*1@RMTt&;N>5M)5f7#Q(iP%1%{+t4Q51?5> zfOlQQ-_P3oj^e0RU#YbgKju6rjLLJ-0ZfACMa=DfvBq ztm)#iil)26boT{5>Y-2%>}6$+faszxjmDDxl`D5{-dsli!gjJvw{&i0-W;z1=q~7- z0NalyF(7=~i0HPH;vkG*CfkpziDRsh^Xiy6@j z-ba}@$L1W`-g}PRfY3c))+3mmKp;Rr>$8-=dt$@c1<*hzD9>ZX!|!i(JfoD7W^Xp4 zmHfO(eMn0pmOwikdc_`a#Dw716-h5J0DnXt;DTSr(9qCShQ+oaJwh87FK-rJDd;zx zoN;nn>laU*y4F=-4-5xvuxvTdIfMHj03$%lVq-SDejRWFljx)It>7>)6F&fCAL985 zAPDg(@U=iP!5A$#G_+buWF3NNh9z#!K$xI1kMTYU{Y>mg(A-TO=2`6p>wpdJbO(=<-zOM z2fAO)(h=7P8{P}hEy#UkT^4SFQi0|Oz)$!ld+ZT=ANo7mu>vIs)MpxYB_eSDnIM{Z zAbG_`MlJ#0ihLFP70OJbb9|XOyphk`m=S}DfPxVX+)r#QN4Iz8AK)g?w8Ioa#|oBb zX!Xwlta01gT9l~Kf3r9;JRGqv(P^}@y84OAe)KQF;KIrS%%bSjH8R0G_?AR|YT^A4 z^5WHlHv%_S_?(Sn=?MhAh>9Y5)d6AL+4)kGp$mJ=e{Npgm|MW9U~WS23E)P=aDOsJ zb7;P05vbCT*o=&f;7dU=8o}1Vw4(|EFt`f%6)%+t%O#WoX5|hYTcr0KP(87F$pCbXYD-2LicNGA{D^|1TB>sh2N+f8g~9$jZv1+J&c8ORlS}eaI^Q zxmOTb4E}72JRGPnycH00fNOgEcoxt@v_~;T0NK~q)BA9zxC(S1#+}p9KAT@yxNb4z z2P5T|$BrL|>@yE6hQWLQP7I-E+NolMGo|Zu!Kf?YLOTtQj&3Z_o(&VJ2)7&b3L9_C%d3Lc-6$j78ZJ?j#b8wQ9GW;Ag?{( zqJq3Hk_22h-r~06&b`Jt2vtyUHUZ`*A~r27*jQO-7RgR>1pI?(0*?=qBY!KT7n~@{ z*zfB2V^8U;iD$st!mqeRpxLo~8W_|StSDm|)qKXRfS5;L( ze4U`6s;a8(n!(r-dDb~{f=_@F5299F>k~dW=`bUJ0*H*(+`b>y3SDy4SZthS;>7Ng zSc6<$_lD5^IEsAiWklQzz?ZYMLW_XRMc%FM&`t4aUPvIM^gafcSC&yjM}lGz{aaLGbRx&(U2}f|2X!q-Tvhe5#!u z#~usImlgxUNFfff8|LPT0z`5Ywwnay^LUKErKAY{Tx?Ht|%%d1c%!{_r*jaX$+Ruif3_o?8*Gs z9&-9@KzOdLo6C^Q3lRu+@n=;X$u;Bzg4e$R$uRungZ?w2tQDAf0-?lrB}<7IsuOyM zIUu3Tyth-h_ZlE+NC>uO7ZGS+UY0N3b%%NfEnbKZND2o5dIjsIVjQ6q)U^MGwWdO_ zdG(Xe_@C91apg_DzxBTMJ^Q=~i+4zdObi#vUv8_|G1O1Q8533`ac0SRtUr&IK^>ML zzxB~`%Y$6O!_dEPUmNj0*iEAWxLa!ny^mZ0skW~v?!D~dUD;$hq6I0FpA|X5E6${( z!-D%f;2`85TY|nEEvg_XroiDjBb4y>YbCSG-`vMcMZ6?EegOelg&EX;fzl)@A2TvC z79|o03NFux=Rnyu%=#Hx60hzG4`T>$a5I>mASIL~0yLc8$%|+85{;)h!7a$YCq;#d zI{4t|!+Dx>0vi#UaoR7K3npuFK?l-MzW&Sqq|nYF!5eNHw=JH=9S)o!<7sx;THAd7 z%oE2LP?3W-!tVBOCveKAPN(fU&-!<`IB$0|#n}M9V@7&+ubwn*eoSR;mcd0A6^@Z3++H~2c8-s72$>*$fIewJ&;Zl6ESDQL)YUy#EOaXt< z#FrHx&AYX&R<=RwX}#F@tMb>3W~jfRc_Gn2Nlq@mL!XczCSY+#=F;2Auq1{4>RXTX zk0f%qq4mZieIS1(G-Dx;@jOp~%ZsEp-&it6P?}QwyEWMJoL5zzF8goo4T+vim+=^# zT~euK#X@;y9%O`Ng2H*lq74NNpX{`wB&OO=wZ-UZc=Q)1ChCo(L_5#W`6=xe({ucw zH2CJt^pyGNbi?=gw%L2}#+9!JPo)@!T*+ME*%De;B6BONtqrW(9^pjudwt_&ptg>7 zB9*3^X6OU4ZjO6wQqyrq+XL(9$Y1?Wd5%h<#PduiWnR_ebjHoVwyjjsh=8TF=qm$p z5RRVl1}nyGbA8XWz)Yo(x|+)c>owl~N#6kHo!q=@vt)rP7r$2U=lB^L@;&7Aaa&PI z%+~(xnD;STg5TVE>U;AQhI^*7bSsi!PrjUPTwY}^u-X@}%e+oH|&YKa^lf>*5X z2Bb^NC@~YsiCm!~GOY1(P-{qxeX*VVx2KB7nx|YkG*L?8B?Xvo(yLeH*F7Rn<+>|a z0xk@W#rrQ+;!otJ5INQmyQY$!{K`>J|y$*M*rUGP=Eh?mf1<3 z#}EFft$9$)Bhg4_lb_QgI0b<}Sz5XkmcbZUiB0R^p+ooN_RsJlMDhwlLE`*#u~$C_ zF5npN3`>>+RYzV-VY6N28vth&iDKjmHP`1rim?ei(~G#B>p9n!B@5tGQf?Eq*FWNH zPfFTUzPsB7J(8#b_BS9 zQve?xUx@DfhVYh9Py!K|4T>(}6I|eoj(g%H#sz8%;)$C__1)b!!lvz^(y?_0aA5Ff zxE=uBhBOQ@2ErEn2G@V_%g2v%zzu;H`)Q$gQI$BLFOfkL?PI?${Z?~8B$Ia`7(pQ7 z5|B#~$T95zh;E+(9e4-$Y#jbVLT-R4gR6!RwsCYJB7(U^*b3_PK3q61i5PA~#s>~S zuob_-_gF2NA)Z141ip#Y7Q9*M85t`^GyJlM1L;GIED^vF543{^5$6FRu1p8V+o#70 zF8~pGc%Sj5iYzBF%F7zkLE6eMGgDtGAE^BJ;0e2Z?F2KZ~1nm4b?;bbzE3~b5?m3)frB{TiA1E(?+zSg& z0+%Z!DvF>bFDD0SA*7o@x+{|~`9>e8Wa%jyfMsn0NU}Oe{9vQF0D-Qod>;X6aq$K! 
z-PKSuCYqGl6(bn!8yTtj-QbmQ6Z{Yam6tC4LTC(Pc)-p*Frk`;62656Lb@Qw%%TTg zR`)4My$RzeNxn-<14eiVSaJ?&C;8V}#Jd93;2(S2d(t{y`o3B+|A7NtMb1Y~o?J!1 zcYc($Q>?(yV`*Lgkz#OoYAEW39b!3B5!{>q;}FCs<%{@n_tZj@Af^46$`J6xs3 zin%5eiZBrW^NB!J+1do}d*U@q|o@$Z8NR6z0TzPPT59zV=ifsG|5G4Afj z&M`80BvjeMoY~vE|Db$IPT@7oO~nAGaqFGHL?Hbn;-Dd=GCK>x#}^3>k8%?+U5kU{ zqzY%wm?K@odSCV*0Jas-?kbAsVG`IZY4srP`ZMq5hLR9-^LW4z7 zCMH5ZQKG`9L;E5L4sZZe0#q2_Xc!|;gm4-@iu=bn20vhEU;v(NvM5s2)}Ec=h_Q_W zqYEfUB8(MCV+5bKZr`5zor3^$`*gk}bu|s42kF4_WL^(y3rviRw$On3r`NuVq?;_5 z8b#i+vNDwJR#sOx7h8={ojB>Sz8u!v=uAAnd;&4B4KY%9JV%(AV}n2vi!uTGl2cTx5e{8H$I8=oANoHeG>U z3M_3T{%E6Yr>2H5KF-o!kDUT7!z~B{zzXB05T+vlkAeK(DEnq3cm}v>eCJFkmD`u^ z--Tslj8NHuWXnVqn)*r%@3`I9thaLVWYOLt`O3FYbQz9L$5qzWOtZyYfE2d zu?`-8e(40wsk(!(=IHl5xs-Puv%X2N@X| z>gUfhC7oqf(C1A15LMK-w{b&$VSLWtzv)|z{i&X~Tqss>%kGZlS7(3< z*|enllcPhNtJWXVLtj!w@w_^I`)|3ecSU2-JlmGL+Lr?wmZ0JRrYs;M6JW+eGQOz7lnwKBLlF>s*waIBz{l5jr66gC>t4aHY?@76?nawiq1eAs4MPV8 zP`B0PQ;|q;Kl~pE@q53`1CN3*?rp_&|RZ-VzZgj^Di%u;X25+EU4;1n0 zpDbn-F1A1LUa$Pt!yXL}cdrF@%;S-fn}?$;&CPYLTFnyGrdF-7a@HMS)q^bQjxBk9QF8%CLdS7Ur zQ+IqDHq_HQffM`Fr}oB1lmZ^xvJ*?GE=0~c%0C}+ayC%4UG{upMy5|+SY9RCJyA2s zTmO~xNF5a6xWG`zM`;fjF`{+SiZ>}UF-ua%@dnRu@S=zgkkc)7SLA^NR(B|r9_g)^ zx3z0`6|3j`Ps86C_ylC?ee9}XR4wW!3BOl*wlK{B zgqXUGG5Nm}rqy@p_Px8{#w?`U3nxaHfFc7Avb|ChN6~cl$9g-0GaR?jL$B-VGP1i& zPZ=H_S#g&yJdn52<8SKgqB`SGhJJKcR&Mqu-52$wqA@t!+tJn!#nKHJXMsYf#L`5| zZJdoBSF%=0m)q#hk-yM#uQw5MnCd)h$DN=c&faMpF)tu1QmQSy)%P%u^4dg>%7cwn zC9f~l%cS*WuCIGu>{CCkucq-j0n}?ZqDE2T8Ju|bsKK(kMLw@Erypn_9c4u90MRBC zjB$G4Rnep+H?(wl`-h=fJm%*4dTF&nZdU@=rQMFBH%QLxU+Ou~{#G*)Y6`@CHq@L} zep85G5KnNT8_|VX4J7J#v7UJ{EiJd$6Qddvi8&P-o%BrYxV&9?*&_|3q<5&QIMHo7GV)FgdBF*6_lu?dnNj#R}x8Y3%NK8x*e zfKwnBesn$hxYY|un?Cp|JrE&((2V%X5bj!tn-Zx1?0U~e7@b8K9R`r(6cqGERCG8t zbFgY~vK#MQD^uJQdZfnwElvh|KR&^qmi>_b6R)HwR6U7h_pX8iV}V?r%qq#D0}Viz zV0PEX${0u=AUS&tiYL9jcjpGq;(N&E%)OuHJI~)RGK#_3wwydv)OlFRUx_o9H+8Wg z?VF>HXLQlP`Wc&y!gt$epHzwMtWf#tO&{0K7$u?YnnvZ^tpEFRu-5EH^DmJXSd)@@ z_>+^Y#BbM^wDG7VR&;P&K=rT)!1$J_3jMx@IOTN~9|G4|Iz?2itvE5`>>zD3`|Bsu z5u2P76x>RCGDV`kFFkmd+;Qd>+l3R4yT5)Gr0@5Tl+>rZxFcU{X@%lO{otpF7Y-H) zvNqA29SN82ICs9_-noZ2&XZ#Z@jhQnl+{6XiD<{-w|~;DLilbV;8CVR4ZTqc{%Diz}QD>!wwq`Gx{q6}EqHT_dY-R1uTWWhE8x_EuR;trn zf3uJ1d3xlj z72khfO*OJZ_0mn&=Brad)9sPk)`6S)VsTpI{h_fce->6Z1O9}Fiq`I&o6Y4amU3J^ z?Hrv*549!{S=o|c-`+>q)>KiCKSiZyU=ZwjwP;~ACH0-`=HJ{+53;wNMb!Mff-m-! 
zg-AK@&}4DQomOpx`-1L2reI zrWkLqCDP-#<3`|fAlE-)|758QAqLb>3#3%oNW~k1S?0;qZ7G}@^9`lH3Mu)NTc&~$}uluPK z^KUDtNerq9XPGO#@;c2UYBl0iOH1=@-TS;p#PfX&_DDYt)eY?MC`f~!(x>?B-tvrG zJ3qRYyFGGxTaFRrEzA5j}BO*WRK zbXnx#_M>|eBZ}?F!))jhb?Ce;(f+qupO?o~_+zA3gl`=&_E)S+l(1 z+d5aa>}N+Mg02**ZAv$3oIQQI%IVuXMuSpb{>k;xX`MKAeer_vjBSy-roYEGGA*(X z?Q3I?UmYv^OI;7KZu0P&|xwXAt> z;~#b0N%=lgt9R?>jF^vEaPBDwd}yvp4k1uNXt2 zgNtYBTeChl)VWwmhIav{*+`)G8bjckRg@Kb4Y`fROXN(bLN>0Awz?rLgqw5 zDVc=|Q3#onsm${{?&InA?QieB)_?7_|Mz;=yXt+P>AvpkIv3clH#h9B$F zaao~Hw-t&X40ZkT)x&FUk}po)=}{Edsh{0L)T}e!%v-cH>#S=o8{x;Q~i&hcT&); zQH}WNYT~3TeaBur>~s2xv9?~`qW)C_?eCwhMAig+_tSH;3WZ(gS}d-2wZ?wCI&ap< zaME!D%=yF2p+m7*ugZWyAUTa<@ zIcoGQrgu$vTp2ePd9Jx#SG^v5Zq`i9<4~07!L8}&hGX`>w&eWpNb0Wf z71p<}sF*jEYt9Pl8n9JM*}2O6pk0-fH`Itws94btH0ORa{@{3j z`Os&lf+(iYh5p((wKi_4r9oABXA`@K!C$Lcq<6wZgP?+PF zx3TG5+FsjjV&hfnsCUgDOShjDdEWne>Y>DO2@c0uM=Htnoi`)J=htTRQb((gG>o#A zey!~s;_tOu{I)O9l78~b72{I5=*?k^m&$yY_}@*%rAg8{!}oWPyz+DtTpgP|S~7mY z;84Pvaku{xslmf%PYZc7TD3-Kj-~70cg$I5ZJkz%d&)-ltg>~>G*ECNHMD{GV ziB)xP9J?`D9r(XJdriEM|*C)8bX10KsR^{m*eO_d5c|vh!0AZgCyxAmCp2to=N!-682>tc%yX+~52a0n8pE#-w{h}Jy7Z3YleyciJ+jQy ziYM8mokA*%L(E0K#~RXH^`Nk6k8F#}xoT>@d9!Qy4!y2jlQ-{N?LMl^?%{wb1@`K2 zecKjE%h1XFo%5X}>4P&(39`ZaGHWjgh~0V7G5t~F%2f#ib>qLiYmS}v&Ch4Gk7^{D z_x3e!$sRDP77Uhl(wAnfzE!f?y7Cy3WX79j&ypvN8{YTLtf|d>FFrM@@ayH4tX7Ar zg7fJqQrAq@8@5KH%-uiuPR4|@H=YRK`ub z1NM@D1YdhU-tsfdxYJCvN-5)`VOpDk;MljBs(o|{4n^<8WCYO{+x%^~>;tIPL~>HKNYzXSxyWgzfVf|=!2iK@8bC0!y|jWVgZ2xjW;njZ(B&hk0Qv!0|t(HeYZZ@!91oNW}ihqBXq&Y zc$eY!QdU+*w)?=NCudB?lLIeWrQdsh^oMqITGQa8_p8(n&zA?j?yp!f@-{s#r!7_- zd}>18*b5dnvPk}bovAQak{L)4eq4jZ8@NQ7<6oMlAdXM9n z*>I(As3%21>el3hZl8i_V-{7!0i#zw`PV9Q(?$lE3tmw!snOXCmfXA)YiQsbA!5_? zQUC1SE3!H=o$f|EuAQ^6ymjl(!lgSiuhe#3bxL8G`e^ZxsMmdyn@f8_@hso0IjO!v z*W~7PLI{q|ZctyIjCA_v?XkyMZXzRuW*^6y%j~|QDP{>CrxVy`O(9 z=u7}}fhDrA@Xa@822w-fMwAJA;BMegaJ1u_JTwt-jn7C_fJBaO-&`^Ki8q~(^xP|g zV;s36d#X|$*>k@bf%Afs7pRYo7W*l$_~Ee^S%TXF+ZdcqcryLSW>(l9YN3Wc@oSOE zY)MoT zbmW>E**)TfD*9SkNo;)%H8NoPK!*EEw*y44w;q_-vZr|K`NV%JZM3`Ga7g|1^NNGc z7s3|Q$&*Gs|BC#5b0+(FvF;sOW$j=?I|W0UMs~;F-U~4oZwT~|WmNz&wna%AZRR-T zLl4dGK0!?G8UYuG2nx!;M=Y7@W^7_{@B?RAZ^OYu+yPZo?%s*Av*Ww&gmPxge<`#o z8PZ*@-um9nJ{=J3l9FJZ(M=Z42V;@G${<*=u!~zip$!fqo&eLq)q&LtRxi-GP~Sx( zJ+r_VZy}oxyD17hS(KmLDeh(Y{LIt*aM@q3K;I%C5zEHADO3|?U+p$L-?N+~KV0Il z_#i%^*rRz{nmxN{AZ<)tyu5c&X67hJSmYvUa`rt0^>m=dWTd3|H%VixTpHZn!;k8 z{@hzgzgoB&d12M-xKilK@A@yo6+x3_-@kp}cvR;75xCRusVN2YV+<|dv89^MJT?nI zDpZvMWq1iMsa6)zHWqm?sCL(mKCOJ6DIB*R`--h)q$p+6m232p$tAg0`8_&U9>iGRGwr&(un;-$?mau5IJsGF&h7MJG*v|4I<| zWR^?r-_NCFPcWeW-Fa0L9TFaxSUhWY7y3oNl3n|}Z~N4NI|Gv6PaD{r<6K%1m>lbU zF@f0+?gQ6~f~ zbi7|Pqwx~&<`J#Bn!jdhHfz_g4lY1DfFD;-GXPq83-Ap^$C=owKGCQC zX~T*Mq&F7?D~wGg82sODX|GISfwaj z$1C_6ojDUWoZLx<%}v@_#5dm&@@TjP!>{5BtcDO{U`1`$m9Np5AlR6Xfj+GJ_7glh zj~tl++Ycn$y%M%;O{&feL@=?XaJ?hjSd;}j5URQ$oB>}wcKkRzE7=WB4Hv=bng~v}RsH-3&CabDElb zMcI?Vbwk9i7uUXoA31gm*zF%6WRQ0px9Jp9?nGrfu3i4X4EdH+5DMB9u4f4$s0V;o zxhklju!ZSUg0{1u+QA&(!h%1T`=dB{Zx-s&sQ(~4BN7_m1r75K6czx&itT&7ArbZk zHrps}34h<)hl=z5K!|s)^@YbI{TCr-4T=MMMaN2aTL{GG7gNrDbj`&?f7C1=E_!GKI zWA36V7MdODCK*}%`LnRzJ9T}1PuL;hf0u7R%I}JsK$I|4);NU#Q4mE}6!=s-f>V|lEcz}?i8-k>yB)SCH!RHPE z93ESGD89m3aqWcq@d8_d;$JjSf|n%<<^|v&W9neC8WlREf$;$CI0_}mh|@enl~LZb zuo%YQK|i>D?HaL8h(tji8A-|T19F{UD8N-fd1O)H7W8)YKFvv!a@~Ne-XevDfwO(cV1{Hf0j!+iq{$O_% z+&Nr`g2`vp3`8VkWMq#0yKs;JTTn{s>1R{$V|T+@4tWun1t{I@Q8h?v50Ua>qS)|a zs>%iTp*D(-DoKoxstw+!mKI&k1aYv<%+Pzo1K=K&m*b&ZC9Y8pi-R%a)gSn4g~I&0&RsH*PcL(WB{2 zO+0)QVq?FF96o%K9F0*N3%fBOH(YPvH7~Uf#h;6L{1}Z!Fg#Au7|Wjab~pI^;w3Oi zv>xunANn_C&WIz`>TVeNuP>Fg5PSBI6!IPMOlCP6{0sQ{h{RB_PaoXXox5G}^!W^J 
zo^diYr8C65IZATQe47O;F;?V`m`rGR^#Lpxlxo?V}+l5Gp@YCG1C5+`Ou;+k-K4_r5%Qi*gp&r95{P8$6r?jlH9p<*C4|B}mht;6d;s5RbrowG9cRy^-ez z8 zu%y%_;P1A%?#@X=4(Of0p@+noK-fVbZV}jD@3Rqag?_NX-*(r3em>AI0-?ztH$>n# zfSV)ukrPoIg4418KksDu_fCuddS~u0g5TsQC|>x-{EbeDfh9{&4~w1DVg@l8fd0eu%tO=onuH*E zs-kVKuC5TIB)__E;3Vz+BlVQ_Ax6$mrfM>Dbcv+0g+`6n6$zebaMVfhmNvPD^30K%Cl!>|4)|zLo33uaPD*^!(=PzGYBL}gC zBha2r!T$G05Tt08g_xl1xck`@LJ@dT6M6e^$;E}7mXQ(S;a=nd*aU#1E0aH{H3JtP z3IqgC{2N0-jFZnBy$3}5CL8@sSPXy)a~qZWz#4f!WHnEopiD|79{V~q)y&rR7xFx8 zI6fPzd-v_b=u$boxzxD31TqekO}^FF6PsRumJcy;eZAQm9SkG*Yeae(VnaXxNlvV< zOy}e^1Cf}|rWErOgjJ#8onW5^`1^m_yUxi4>Hs_^SFw1|C+CE- zsYZo41Ye0LOx%Fzh!>0750Q9Whjt+N}w;tw;D}vnh@4B;8jU*d8`9CSaOGM_cH8qklGWS>| zzJnwSRcE_a@gB}4T`VcT;Hth)Sc=fr2L)l6Sz#+%RFtiocLO3lY!WJ0+7P@gVK{(` zJPCXlY}_YRRI=c#2}U>sJ79ByiHc*CN(>4L%UO~N2Z)hG8570@)KWaD8+Y{+zHaZ|J;V`tgXuHe^=hq7CesC8j+qn}?y@n@F1Pt$!*?|~0oQ+Ao33U~m zg7K2W@S?|bEm=E*1UJf`w~OWJSsWbNK_un#7lO^_+p*BcbOyyod{qZ5ERJ90+w4>0F|9iu%D(Et;-g~JTi z8+TFB_W;CM9(8N$tk~EtzilK)vWPT2aQSeo!Y1!y6{g9^zyQ%Y%<>Tx1>xx#!78yh z+G>S!0+xtM4DX)bNM*NI-C9T49)}+M2Gk%yZbBg0O-ZSB>C(X>G5>{$E?3+i05{ON zQSR5mq=(cHA0(JOM#qb%_TyCYtT|FCOSsTXOuSA{U&5IH;4XUJ{n3}XZNnN+p_6Z| z3*`$?7fdrDAtC&%)j!`@@c}gdK0tZa92zPtb<79|)!^q1Gy#)b+3CW=FtrreGA}^N zL!=ZgPbcA|1y}<~fufV)4kDP=xrCCoMj|*!2#rr4JwnLv8E)h}<@xTm3K#L<7D56h1L0c_G=w;8veEF#$|3;7 z4?Cnb_>)%)9flyJgS1P9`QRbq?vjJGA0wVico*u5Jfz z1I-*DDg6qMdGJbc3!Fqq>sNKae{Cv8kfN1(ul=m3=tC65V?m4$3a0r~h1Dk@a1&`N zc0@S24oq|k)~fT+$)*!h&m(pi89i4|Cr7IP_O0m~9Z1-EB*yY%wa%ZH1RQ`vJUG7n zFqw^wJ>#UR%sWWLS`)h}Zh3)2g9Q)YMKn&6IC6xuDB122{K9cgLbOG4Itw@LCJm%` ze0T1hjKG3le$m?2h7vwb3~u-x*hy4k=3XN}dt&~8%nslfG(QFw7XG`sK>n%Le)`#6 z(bdIM)dv9^d{VW02Q-|d2%LdffFcG35Xgvua|OHmt3E41_m2j$>Tv2L3_2T64m^1Ll9kkynPAWI@ zFw(hnarUTb#8B*sxs&1KcQbj4^1nYldL5$0D_Co=?iP{ScIjVsN;_vhH#>`lPDy}b z$DXM?`0pM8KZQ~GNeUORt+zVXQgigXr=yVn?wquM2(To2B+e|46(Lm*-H~3a*x&N=CnV13 z?g0y0*S2H2+xi%5HS^lFqfGMm^}Mg0g~dIw3l|Wkrq)(NxF$275M3uH5jB!~LF7I9_R6eQP%{->ZcIC?I5&-hI35RCIbOsfj-eqPPh*KNveqV)_vk zNn#fE5>Nwi!|5&aewg<(h5x}fU-wG0HXko<-(bf-ohIJk!* z#e}tqszg}hoH-^Pl)?VwdX1%!7?bYYi4uV$HbIp(E;Tcc-6U&%3WxX4C&{fkj@}i# z{-fz;t=TMAlppLJGs=fB;hW#(rB|T9ObGTZhCrBs+*)XLs97mDx-s>y}*f z{JRPoL|WuP#lBmcKxD`T2w+D!jXG_P-`T~Awns!Y_Hd;`E$_$vZAT+cD@*2O8%cjR zcZV(vzSv83hs$~R_`-Z$hFsr#Q`(iho6S_bG7e2ikJo$aSC&R> zPj57bDW%mcj8DDaP!eFSz7==z`K`6#&YRb4Ru?8~?}ZvXK#N+H)&D zeKJEl>-+RoYwelQUv&ZSOv+u{x2R~S3F7&A8wGJ_)_yXA=v%WrJz&hH}ti1wqtr;sGF??uWMwp&ys+@fy&E5-VFMn$L zuq?gQ{JvwSQ{K=KE{}(PE2gjF{iBx(z0P$-{ZufN@l*R5>zJ3EG#l8;#PGJyU*_>d zIIk4aVicI+&d`v(`}0u#LY>N>R4<+b&rhJH&#_fNO_fZwlDRsWLB-H3=S%wRe0$E8 z9OK06yC{S{X8%`5r!MuMEfT;2D6FW&3rCCrZ-()9F}?A~$H zbbPW*tnu^5$f^Elvl~8^+KzcmJP|ld6&0=G@Tn?Ig|IWyV!LS#U>E%m5sg1z*77Wn zGD$x`P5H!BMJ=So(;+cTHZw%lopWpKZNO+uhMkm|jp6G%L1jnj0|W~mSJ01yJT90> zuj@Q>B8uuX)q!=(Ggqcdwd2=5yka|5d#oT;#PZDn3QleBsqP_Jm)cfGi8@;4kkQ;R zyl30$HJXx~vfp!Q<`3U_+2Mr%p69>v!mj26+9!uy6 zG+B$CI>8&&)7=dfkIH}l{&AeyfaCEDCvW4M>uo;FMpEkWfL58jd7hjlCEPMub7Sq# znSlOiFaE*bEGHgROi{-jbr>qsbt$4gr18zg+pA-uu+wb6=2hmxuQk7?m48v(a3Abo zrKUfjH4@?2=2n+CeQztiP~kV(?g=%8pDZU%JqW)QMoC#Xajug(A$UDDk~b6p1RNR$ zfW9s6X5=RB4-l4{o=5q#6rUa^~HG6Rzs3kzFyLjLvxwhCE58j zCs~3{4>iy7Go9EezEy0zZ_?))gVkhw>GJb1+Rc&G!k0>dks|M+1j-ktT20wFOQyee zoOJP-9MvF8Jb z5}+wYP6et+j}fqD02}Z;YYOtZ@ZUVO@W2G*XQelvNEOvTeQ+*uZ{BYBemL*SHGBpG6F_mXlkhY|8YKw*jw za25fcD;4YaxJQ;Tsh{4WYy5I(eWRj+BPHKdzre3m5%-IZBuDuaKKNDH(QXbJddYd* ziu%6(W^-QfO7D@=2|2v{N7&;!3Jw-D_OuncGw^))?Y#|by&Ku5ESegVjv+QjeyS zUgQt16dpY0a((e>sc#Q?P`+B!Szs7Ag#)0QU_T^6x2{j&7m3PO1f*!8D(e!*mL_aZ zdNgEQIo-cy&hc*og9!U`1~S&$TjCn z%jkxvV|`2=#nEqks-P?gblR0LE-bkY`Ff(j&JzCX$;~;>!42<$XFXb0N1mx16&L?< 
[GIT binary patch data (base85 payload) omitted]

Date: Wed, 29 Jan 2025 12:00:01 -0500
Subject: [PATCH 04/33] Remove nvks runners from testing pool. (#3580)

---
 ci/matrix.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 6a98e8fc5b0..c3f03d323ab 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -257,13 +257,13 @@ projects:
 # testing -> Runner with GPU is in a nv-gh-runners testing pool
 gpus:
-  v100: { sm: 70 } # 32 GB, 40 runners
-  t4: { sm: 75, testing: true } # 16 GB, 8 runners
-  rtx2080: { sm: 75, testing: true } # 8 GB, 8 runners
-  rtxa6000: { sm: 86, testing: true } # 48 GB, 12 runners
-  l4: { sm: 89, testing: true } # 24 GB, 48 runners
-  rtx4090: { sm: 89, testing: true } # 24 GB, 10 runners
-  h100: { sm: 90, testing: true } # 80 GB, 16 runners
+  v100: { sm: 70 } # 32 GB, 40 runners
+  t4: { sm: 75 } # 16 GB, 10 runners
+  rtx2080: { sm: 75 } # 8 GB, 12 runners
+  rtxa6000: { sm: 86 } # 48 GB, 12 runners
+  l4: { sm: 89 } # 24 GB, 48 runners
+  rtx4090: { sm: 89 } # 24 GB, 10 runners
+  h100: { sm: 90 } # 80 GB, 16 runners
 
 # Tags are used to define a `matrix job` in the workflow section.
 #

From d0f254490bad268887e33266dc64a0722318ef30 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Wed, 29 Jan 2025 19:39:31 +0100
Subject: [PATCH 05/33] Try and get rapids green (#3503)

* Drop unneeded mdspan macros
* Guard `cub/detail/launcher/cuda_runtime.cuh` by a cuda compliler
* Fix formatting issue
* Fix bug in contiguous storage
---
 cub/cub/util_device.cuh | 4 ++-
 libcudacxx/include/cuda/std/__mdspan/config.h | 26 -------------------
 .../mdspan.mdspan.cons/ctad_c_array.pass.cpp | 4 +--
 .../ctad_const_c_array.pass.cpp | 2 +-
 thrust/thrust/detail/contiguous_storage.h | 2 +-
 5 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh
index 498f17c1259..fd356b8f9e5 100644
--- a/cub/cub/util_device.cuh
+++ b/cub/cub/util_device.cuh
@@ -713,4 +713,6 @@ private:
 
 CUB_NAMESPACE_END
 
-#include // to complete the definition of TripleChevronFactory
+#if _CCCL_HAS_CUDA_COMPILER
+# include // to complete the definition of TripleChevronFactory
+#endif // _CCCL_HAS_CUDA_COMPILER
diff --git a/libcudacxx/include/cuda/std/__mdspan/config.h b/libcudacxx/include/cuda/std/__mdspan/config.h
index 9f1c9898dd2..4cbeffc18c2 100644
--- a/libcudacxx/include/cuda/std/__mdspan/config.h
+++ b/libcudacxx/include/cuda/std/__mdspan/config.h
@@ -245,32 +245,6 @@ static_assert(__MDSPAN_CPLUSPLUS >= __MDSPAN_CXX_STD_14, "mdspan requires C++14
 # endif
 # endif
 
-# if __MDSPAN_USE_BRACKET_OPERATOR
-# define __MDSPAN_OP(mds, ...) mds[__VA_ARGS__]
-// Corentins demo compiler for subscript chokes on empty [] call,
-// though I believe the proposal supports it?
-# ifdef __MDSPAN_NO_EMPTY_BRACKET_OPERATOR -# define __MDSPAN_OP0(mds) mds.accessor().access(mds.data_handle(), 0) -# else -# define __MDSPAN_OP0(mds) mds[] -# endif -# define __MDSPAN_OP1(mds, a) mds[a] -# define __MDSPAN_OP2(mds, a, b) mds[a, b] -# define __MDSPAN_OP3(mds, a, b, c) mds[a, b, c] -# define __MDSPAN_OP4(mds, a, b, c, d) mds[a, b, c, d] -# define __MDSPAN_OP5(mds, a, b, c, d, e) mds[a, b, c, d, e] -# define __MDSPAN_OP6(mds, a, b, c, d, e, f) mds[a, b, c, d, e, f] -# else -# define __MDSPAN_OP(mds, ...) mds(__VA_ARGS__) -# define __MDSPAN_OP0(mds) mds() -# define __MDSPAN_OP1(mds, a) mds(a) -# define __MDSPAN_OP2(mds, a, b) mds(a, b) -# define __MDSPAN_OP3(mds, a, b, c) mds(a, b, c) -# define __MDSPAN_OP4(mds, a, b, c, d) mds(a, b, c, d) -# define __MDSPAN_OP5(mds, a, b, c, d, e) mds(a, b, c, d, e) -# define __MDSPAN_OP6(mds, a, b, c, d, e, f) mds(a, b, c, d, e, f) -# endif - #endif // _CCCL_STD_VER > 2011 #endif // _LIBCUDACXX___MDSPAN_CONFIG_HPP diff --git a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.mdspan.cons/ctad_c_array.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.mdspan.cons/ctad_c_array.pass.cpp index 930a772b596..9288d444402 100644 --- a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.mdspan.cons/ctad_c_array.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.mdspan.cons/ctad_c_array.pass.cpp @@ -32,7 +32,7 @@ int main(int, char**) assert(m.rank_dynamic() == 0); assert(m.static_extent(0) == 5); assert(m.extent(0) == 5); - assert(__MDSPAN_OP(m, 2) == 3); + assert(m[2] == 3); cuda::std::mdspan m2(data, 3); @@ -43,7 +43,7 @@ int main(int, char**) assert(m2.rank() == 1); assert(m2.rank_dynamic() == 1); assert(m2.extent(0) == 3); - assert(__MDSPAN_OP(m2, 2) == 3); + assert(m2[2] == 3); } #endif diff --git a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.mdspan.cons/ctad_const_c_array.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.mdspan.cons/ctad_const_c_array.pass.cpp index 7957a3cdd39..04656311604 100644 --- a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.mdspan.cons/ctad_const_c_array.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.mdspan.cons/ctad_const_c_array.pass.cpp @@ -32,7 +32,7 @@ int main(int, char**) assert(m.rank_dynamic() == 0); assert(m.static_extent(0) == 5); assert(m.extent(0) == 5); - assert(__MDSPAN_OP(m, 2) == 3); + assert(m[2] == 3); } #endif diff --git a/thrust/thrust/detail/contiguous_storage.h b/thrust/thrust/detail/contiguous_storage.h index 2391a3a968a..359b45c1ea2 100644 --- a/thrust/thrust/detail/contiguous_storage.h +++ b/thrust/thrust/detail/contiguous_storage.h @@ -131,7 +131,7 @@ class contiguous_storage } else if constexpr (!allocator_traits::is_always_equal::value) { - NV_IF_TARGET(NV_IS_DEVICE, (assert(m_allocator == other);), (if (m_allocator != other.m_allocator) { + NV_IF_TARGET(NV_IS_DEVICE, (assert(m_allocator == other.m_allocator);), (if (m_allocator != other.m_allocator) { throw allocator_mismatch_on_swap(); })); } From da97c370629e2f7620e47efc078071cb80d2b738 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 29 Jan 2025 20:36:26 +0100 Subject: [PATCH 06/33] fixes return type of tabulate out iter (#3573) --- thrust/thrust/iterator/detail/tabulate_output_iterator.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thrust/thrust/iterator/detail/tabulate_output_iterator.inl 
b/thrust/thrust/iterator/detail/tabulate_output_iterator.inl index f9b740bca6b..56c093ff56a 100644 --- a/thrust/thrust/iterator/detail/tabulate_output_iterator.inl +++ b/thrust/thrust/iterator/detail/tabulate_output_iterator.inl @@ -53,7 +53,7 @@ template using tabulate_output_iterator_base = thrust::iterator_adaptor, counting_iterator, - thrust::use_default, + void, System, thrust::use_default, tabulate_output_iterator_proxy>; From 09b12009d906bdb69f9da60de5196991d0610f9e Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Wed, 29 Jan 2025 11:39:14 -0800 Subject: [PATCH 07/33] Add `__int128` and `__float128` detection macros (#3413) --- docs/cccl_development/macro.rst | 22 +++++----- libcudacxx/include/cuda/__cccl_config | 2 +- ...floating_point.h => extended_data_types.h} | 42 ++++++++++++++++--- .../macros}/architecture.compile.pass.cpp | 0 ....cpp => extended_data_types.bf16.fail.cpp} | 24 ++--------- .../macros/extended_data_types.fp128.fail.cpp | 23 ++++++++++ .../macros/extended_data_types.fp16.fail.cpp | 26 ++++++++++++ .../macros/extended_data_types.fp8.fail.cpp | 26 ++++++++++++ .../macros/extended_data_types.i128.fail.cpp | 24 +++++++++++ ....pass.cpp => extended_data_types.pass.cpp} | 24 +++++++---- .../macros}/os.compile.pass.cpp | 0 11 files changed, 169 insertions(+), 44 deletions(-) rename libcudacxx/include/cuda/std/__cccl/{extended_floating_point.h => extended_data_types.h} (54%) rename libcudacxx/test/libcudacxx/{std/cccl => libcxx/macros}/architecture.compile.pass.cpp (100%) rename libcudacxx/test/libcudacxx/libcxx/macros/{extended_floating_point.fail.cpp => extended_data_types.bf16.fail.cpp} (60%) create mode 100644 libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.fp128.fail.cpp create mode 100644 libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.fp16.fail.cpp create mode 100644 libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.fp8.fail.cpp create mode 100644 libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.i128.fail.cpp rename libcudacxx/test/libcudacxx/libcxx/macros/{extended_floating_point.pass.cpp => extended_data_types.pass.cpp} (62%) rename libcudacxx/test/libcudacxx/{std/cccl => libcxx/macros}/os.compile.pass.cpp (100%) diff --git a/docs/cccl_development/macro.rst b/docs/cccl_development/macro.rst index 5cc39f0f0d3..6b832da0000 100644 --- a/docs/cccl_development/macro.rst +++ b/docs/cccl_development/macro.rst @@ -184,18 +184,20 @@ Non-standard Types Support +------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ | ``_CCCL_HAS_NVBF16`` | ``__nv_bfloat16/__nv_bfloat162`` data types are supported and enabled. Prefer over ``__CUDA_BF16_TYPES_EXIST__`` | +------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ -| ``_CCCL_HAS_FP8()`` | ``__nv_fp8_e5m2/__nv_fp8_e4m3/__nv_fp8_e8m0`` data types are supported and enabled. Prefer over ``__CUDA_FP8_TYPES_EXIST__`` | +| ``_CCCL_HAS_NVFP8()`` | ``__nv_fp8_e5m2/__nv_fp8_e4m3/__nv_fp8_e8m0`` data types are supported and enabled. 
Prefer over ``__CUDA_FP8_TYPES_EXIST__`` | +------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ -+------------------------------+-------------------------------------------------------------------------+ -| ``_CCCL_DISABLE_INT128`` | Disable ``__int128/__uint128_t`` support | -+------------------------------+-------------------------------------------------------------------------+ -| ``_CCCL_DISABLE_FLOAT128`` | Disable ``__float128`` support | -+------------------------------+-------------------------------------------------------------------------+ -| ``_LIBCUDACXX_HAS_NVFP16`` | ``__half/__half2`` host/device are supported (CUDA 12.2+) | -+------------------------------+-------------------------------------------------------------------------+ -| ``_LIBCUDACXX_HAS_NVBF16`` | ``__nv_bfloat16/__nv_bfloat162`` host/device are supported (CUDA 12.2+) | -+------------------------------+-------------------------------------------------------------------------+ ++---------------------------------+-------------------------------------------------------------------------+ +| ``_CCCL_DISABLE_NVFP8_SUPPORT`` | Disable ``__nv_fp8_e5m2/__nv_fp8_e4m3/__nv_fp8_e8m0`` support | ++---------------------------------+-------------------------------------------------------------------------+ +| ``_CCCL_DISABLE_INT128`` | Disable ``__int128/__uint128_t`` support | ++---------------------------------+-------------------------------------------------------------------------+ +| ``_CCCL_DISABLE_FLOAT128`` | Disable ``__float128`` support | ++---------------------------------+-------------------------------------------------------------------------+ +| ``_LIBCUDACXX_HAS_NVFP16`` | ``__half/__half2`` host/device are supported (CUDA 12.2+) | ++---------------------------------+-------------------------------------------------------------------------+ +| ``_LIBCUDACXX_HAS_NVBF16`` | ``__nv_bfloat16/__nv_bfloat162`` host/device are supported (CUDA 12.2+) | ++---------------------------------+-------------------------------------------------------------------------+ ---- diff --git a/libcudacxx/include/cuda/__cccl_config b/libcudacxx/include/cuda/__cccl_config index fc28d63ace1..e7fdf26dcc5 100644 --- a/libcudacxx/include/cuda/__cccl_config +++ b/libcudacxx/include/cuda/__cccl_config @@ -21,7 +21,7 @@ #include // IWYU pragma: export #include // IWYU pragma: export #include // IWYU pragma: export -#include // IWYU pragma: export +#include // IWYU pragma: export #include // IWYU pragma: export #include // IWYU pragma: export #include // IWYU pragma: export diff --git a/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h b/libcudacxx/include/cuda/std/__cccl/extended_data_types.h similarity index 54% rename from libcudacxx/include/cuda/std/__cccl/extended_floating_point.h rename to libcudacxx/include/cuda/std/__cccl/extended_data_types.h index dee553633d8..034ebc51b5f 100644 --- a/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h +++ b/libcudacxx/include/cuda/std/__cccl/extended_data_types.h @@ -4,12 +4,12 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -#ifndef __CCCL_EXTENDED_FLOATING_POINT_H -#define __CCCL_EXTENDED_FLOATING_POINT_H +#ifndef __CCCL_EXTENDED_DATA_TYPES_H +#define __CCCL_EXTENDED_DATA_TYPES_H #include #include @@ -23,8 +23,21 @@ #endif // no system header #include +#include #include +#if !defined(_CCCL_DISABLE_INT128) +# if _CCCL_COMPILER(NVRTC) && defined(__CUDACC_RTC_INT128__) && _CCCL_OS(LINUX) +# define _CCCL_HAS_INT128() 1 +# elif defined(__SIZEOF_INT128__) && _CCCL_OS(LINUX) +# define _CCCL_HAS_INT128() 1 +# else +# define _CCCL_HAS_INT128() 0 +# endif +#else +# define _CCCL_HAS_INT128() 0 +#endif // !_CCCL_DISABLE_INT128 + #if !defined(_CCCL_HAS_NVFP16) # if _CCCL_HAS_INCLUDE() && (_CCCL_HAS_CUDA_COMPILER || defined(LIBCUDACXX_ENABLE_HOST_NVFP16)) \ && !defined(CCCL_DISABLE_FP16_SUPPORT) @@ -44,9 +57,28 @@ # define _CCCL_HAS_NVFP8() 1 # else # define _CCCL_HAS_NVFP8() 0 -# endif // _CCCL_HAS_INCLUDE() +# endif // _CCCL_HAS_INCLUDE() && defined(_CCCL_HAS_NVFP16) && defined(_CCCL_HAS_NVBF16) #else # define _CCCL_HAS_NVFP8() 0 #endif // !defined(_CCCL_DISABLE_NVFP8_SUPPORT) -#endif // __CCCL_EXTENDED_FLOATING_POINT_H +#if !defined(_CCCL_DISABLE_FLOAT128) +# if _CCCL_COMPILER(NVRTC) && defined(__CUDACC_RTC_FLOAT128__) && _CCCL_OS(LINUX) +# if !defined(__CUDA_ARCH__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000) +# define _CCCL_HAS_FLOAT128() 1 +# else +# define _CCCL_HAS_FLOAT128() 0 +# endif +// NVC++ support float128 only in host code +# elif (defined(__SIZEOF_FLOAT128__) || defined(__FLOAT128__)) && _CCCL_OS(LINUX) && !_CCCL_CUDA_COMPILER(NVHPC) +# if !defined(__CUDA_ARCH__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000) +# define _CCCL_HAS_FLOAT128() 1 +# else +# define _CCCL_HAS_FLOAT128() 0 +# endif +# else +# define _CCCL_HAS_FLOAT128() 0 +# endif +#endif // !defined(_CCCL_DISABLE_FLOAT128) + +#endif // __CCCL_EXTENDED_DATA_TYPES_H diff --git a/libcudacxx/test/libcudacxx/std/cccl/architecture.compile.pass.cpp b/libcudacxx/test/libcudacxx/libcxx/macros/architecture.compile.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/std/cccl/architecture.compile.pass.cpp rename to libcudacxx/test/libcudacxx/libcxx/macros/architecture.compile.pass.cpp diff --git a/libcudacxx/test/libcudacxx/libcxx/macros/extended_floating_point.fail.cpp b/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.bf16.fail.cpp similarity index 60% rename from libcudacxx/test/libcudacxx/libcxx/macros/extended_floating_point.fail.cpp rename to libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.bf16.fail.cpp index 3b6b457e633..fef41309a67 100644 --- a/libcudacxx/test/libcudacxx/libcxx/macros/extended_floating_point.fail.cpp +++ b/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.bf16.fail.cpp @@ -6,37 +6,19 @@ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -#include +#include #include "test_macros.h" -#if !_CCCL_HAS_NVFP8() -# include -#endif -#if !defined(_CCCL_HAS_NVFP16) -# include -#endif #if !defined(_CCCL_HAS_NVBF16) # include #endif int main(int, char**) { -#if !_CCCL_HAS_NVFP8() - auto x = __nv_fp8_e4m3(1.0f); - unused(x); -#else - static_assert(false); -#endif -#if !defined(_CCCL_HAS_NVFP16) - auto y = __half(1.0f); - unused(y); -#else - static_assert(false); -#endif #if !defined(_CCCL_HAS_NVBF16) - auto z = __nv_bfloat16(1.0f); - unused(z); + auto x3 = __nv_bfloat16(1.0f); + unused(x3); #else static_assert(false); #endif diff --git a/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.fp128.fail.cpp b/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.fp128.fail.cpp new file mode 100644 index 00000000000..5cebaa96168 --- /dev/null +++ b/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.fp128.fail.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +#include + +#include "test_macros.h" + +int main(int, char**) +{ +#if !_CCCL_HAS_FLOAT128() + __float128 x4 = __float128(3.14) + __float128(3.14); + unused(x4); +#else + static_assert(false); +#endif + return 0; +} diff --git a/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.fp16.fail.cpp b/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.fp16.fail.cpp new file mode 100644 index 00000000000..d4925743db9 --- /dev/null +++ b/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.fp16.fail.cpp @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +#include + +#include "test_macros.h" + +#if !defined(_CCCL_HAS_NVFP16) +# include +#endif + +int main(int, char**) +{ +#if !defined(_CCCL_HAS_NVFP16) + auto x2 = __half(1.0f); + unused(x2); +#else + static_assert(false); +#endif + return 0; +} diff --git a/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.fp8.fail.cpp b/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.fp8.fail.cpp new file mode 100644 index 00000000000..7418e0030b7 --- /dev/null +++ b/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.fp8.fail.cpp @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +#include + +#include "test_macros.h" + +#if !_CCCL_HAS_NVFP8() +# include +#endif + +int main(int, char**) +{ +#if !_CCCL_HAS_NVFP8() + auto x1 = __nv_fp8_e4m3(1.0f); + unused(x1); +#else + static_assert(false); +#endif + return 0; +} diff --git a/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.i128.fail.cpp b/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.i128.fail.cpp new file mode 100644 index 00000000000..04dd7ea7525 --- /dev/null +++ b/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.i128.fail.cpp @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +#include + +#include "test_macros.h" + +int main(int, char**) +{ +#if !_CCCL_HAS_INT128() + __int128 x = __int128(123456789123) + __int128(123456789123); + __uint128_t y = __uint128_t(123456789123) + __uint128_t(123456789123); + unused(x); + unused(y); +#else + static_assert(false); +#endif + return 0; +} diff --git a/libcudacxx/test/libcudacxx/libcxx/macros/extended_floating_point.pass.cpp b/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.pass.cpp similarity index 62% rename from libcudacxx/test/libcudacxx/libcxx/macros/extended_floating_point.pass.cpp rename to libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.pass.cpp index fa1476611bc..3b230d8710b 100644 --- a/libcudacxx/test/libcudacxx/libcxx/macros/extended_floating_point.pass.cpp +++ b/libcudacxx/test/libcudacxx/libcxx/macros/extended_data_types.pass.cpp @@ -6,7 +6,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -#include +#include #include "test_macros.h" @@ -22,17 +22,27 @@ int main(int, char**) { +#if _CCCL_HAS_INT128() + auto x1 = __int128(123456789123) + __int128(123456789123); + auto y1 = __uint128_t(123456789123) + __uint128_t(123456789123); + unused(x1); + unused(y1); +#endif #if _CCCL_HAS_NVFP8() - auto x = __nv_fp8_e4m3(1.0f); - unused(x); + auto x2 = __nv_fp8_e4m3(1.0f); + unused(x2); #endif #if defined(_CCCL_HAS_NVFP16) - auto y = __half(1.0f); - unused(y); + auto x3 = __half(1.0f); + unused(x3); #endif #if defined(_CCCL_HAS_NVBF16) - auto z = __nv_bfloat16(1.0f); - unused(z); + auto x4 = __nv_bfloat16(1.0f); + unused(x4); +#endif +#if _CCCL_HAS_FLOAT128() + __float128 x5 = __float128(3.14) + __float128(3.14); + unused(x5); #endif return 0; } diff --git a/libcudacxx/test/libcudacxx/std/cccl/os.compile.pass.cpp b/libcudacxx/test/libcudacxx/libcxx/macros/os.compile.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/std/cccl/os.compile.pass.cpp rename to libcudacxx/test/libcudacxx/libcxx/macros/os.compile.pass.cpp From ced506dd40f78817047d8f93d785a2f898711d2e Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Wed, 29 Jan 2025 12:49:56 -0800 Subject: [PATCH 08/33] Remove all code paths and policies for SM37 and below (#3466) --- .../adjacent_difference/subtract_left.cu | 4 +- cub/benchmarks/bench/copy/memcpy.cu | 2 +- .../bench/histogram/histogram_common.cuh | 2 +- cub/benchmarks/bench/partition/three_way.cu | 2 +- cub/benchmarks/bench/reduce/by_key.cu | 4 +- .../bench/run_length_encode/encode.cu | 4 +- .../run_length_encode/non_trivial_runs.cu | 4 +- cub/benchmarks/bench/segmented_sort/keys.cu | 4 +- cub/benchmarks/bench/select/unique_by_key.cu | 4 +- cub/benchmarks/bench/transform/common.h | 2 +- .../device/dispatch/dispatch_spmv_orig.cuh | 50 +---------- .../tuning/tuning_adjacent_difference.cuh | 14 +--- .../dispatch/tuning/tuning_batch_memcpy.cuh | 6 +- cub/cub/device/dispatch/tuning/tuning_for.cuh | 4 +- .../dispatch/tuning/tuning_histogram.cuh | 9 +- .../device/dispatch/tuning/tuning_merge.cuh | 14 +--- .../dispatch/tuning/tuning_merge_sort.cuh | 6 +- .../dispatch/tuning/tuning_radix_sort.cuh | 83 +------------------ .../device/dispatch/tuning/tuning_reduce.cuh | 23 +---- .../dispatch/tuning/tuning_reduce_by_key.cuh | 6 +- .../tuning/tuning_run_length_encode.cuh | 16 ++-- .../device/dispatch/tuning/tuning_scan.cuh | 4 +- .../dispatch/tuning/tuning_scan_by_key.cuh | 4 +- .../dispatch/tuning/tuning_segmented_sort.cuh | 28 +------ .../dispatch/tuning/tuning_select_if.cuh | 6 +- .../tuning/tuning_three_way_partition.cuh | 6 +- .../dispatch/tuning/tuning_unique_by_key.cuh | 6 +- ...vice_run_length_encode_non_trivial_runs.cu | 4 +- cub/test/catch2_test_util_device.cu | 21 ++--- cub/test/catch2_test_vsmem.cu | 8 +- docs/cub/developer_overview.rst | 6 +- docs/repo.toml | 2 +- .../libcudacxx/cuda/test_platform.pass.cpp | 24 ------ libcudacxx/test/support/concurrent_agents.h | 4 - thrust/thrust/system/cuda/detail/core/util.h | 40 +-------- thrust/thrust/system/cuda/detail/reduce.h | 16 +--- .../thrust/system/cuda/detail/reduce_by_key.h | 49 +---------- .../system/cuda/detail/set_operations.h | 21 ----- thrust/thrust/system/cuda/detail/unique.h | 30 ------- 39 files changed, 86 insertions(+), 456 deletions(-) diff --git a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu index 
6976b024d37..89e4bc485e9 100644 --- a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu +++ b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu @@ -35,7 +35,7 @@ #if !TUNE_BASE struct policy_hub_t { - struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { using AdjacentDifferencePolicy = cub::AgentAdjacentDifferencePolicy; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; #endif // !TUNE_BASE diff --git a/cub/benchmarks/bench/copy/memcpy.cu b/cub/benchmarks/bench/copy/memcpy.cu index 678091cb0c0..07162bf602a 100644 --- a/cub/benchmarks/bench/copy/memcpy.cu +++ b/cub/benchmarks/bench/copy/memcpy.cu @@ -118,7 +118,7 @@ using block_delay_constructor_t = struct policy_hub_t { - struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t> + struct policy_t : cub::ChainedPolicy<500, policy_t, policy_t> { using AgentSmallBufferPolicyT = cub::detail::AgentBatchMemcpyPolicy< TUNE_THREADS, diff --git a/cub/benchmarks/bench/histogram/histogram_common.cuh b/cub/benchmarks/bench/histogram/histogram_common.cuh index d6a7f9f9173..93eea3e8e02 100644 --- a/cub/benchmarks/bench/histogram/histogram_common.cuh +++ b/cub/benchmarks/bench/histogram/histogram_common.cuh @@ -62,7 +62,7 @@ constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::BLEND; template struct policy_hub_t { - struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t> + struct policy_t : cub::ChainedPolicy<500, policy_t, policy_t> { static constexpr cub::BlockLoadAlgorithm load_algorithm = (TUNE_LOAD_ALGORITHM == cub::BLOCK_LOAD_STRIPED) diff --git a/cub/benchmarks/bench/partition/three_way.cu b/cub/benchmarks/bench/partition/three_way.cu index 9b1fdb0e18d..ff53970d824 100644 --- a/cub/benchmarks/bench/partition/three_way.cu +++ b/cub/benchmarks/bench/partition/three_way.cu @@ -47,7 +47,7 @@ template struct policy_hub_t { - struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t> + struct policy_t : cub::ChainedPolicy<500, policy_t, policy_t> { using ThreeWayPartitionPolicy = // cub::AgentThreeWayPartitionPolicy + struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { using ReduceByKeyPolicyT = cub::AgentReduceByKeyPolicy; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; #endif // !TUNE_BASE diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu index 9a62b073e75..481f9a4f2ae 100644 --- a/cub/benchmarks/bench/run_length_encode/encode.cu +++ b/cub/benchmarks/bench/run_length_encode/encode.cu @@ -55,7 +55,7 @@ struct reduce_by_key_policy_hub { - struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { using ReduceByKeyPolicyT = cub::AgentReduceByKeyPolicy; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; #endif // !TUNE_BASE diff --git a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu index 112b716ca86..398711fed80 100644 --- a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu +++ b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu @@ -54,7 +54,7 @@ struct device_rle_policy_hub { - struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { using RleSweepPolicyT = cub::AgentRlePolicy; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; #endif // 
!TUNE_BASE diff --git a/cub/benchmarks/bench/segmented_sort/keys.cu b/cub/benchmarks/bench/segmented_sort/keys.cu index b3ecbf51656..8d793c67e44 100644 --- a/cub/benchmarks/bench/segmented_sort/keys.cu +++ b/cub/benchmarks/bench/segmented_sort/keys.cu @@ -109,7 +109,7 @@ struct device_seg_sort_policy_hub { using DominantT = KeyT; - struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { static constexpr int BLOCK_THREADS = TUNE_THREADS; static constexpr int RADIX_BITS = TUNE_RADIX_BITS; @@ -143,7 +143,7 @@ struct device_seg_sort_policy_hub TUNE_M_LOAD_MODIFIER>>; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; #endif // !TUNE_BASE diff --git a/cub/benchmarks/bench/select/unique_by_key.cu b/cub/benchmarks/bench/select/unique_by_key.cu index 7950aaeda2f..473aff6b589 100644 --- a/cub/benchmarks/bench/select/unique_by_key.cu +++ b/cub/benchmarks/bench/select/unique_by_key.cu @@ -53,7 +53,7 @@ struct policy_hub { - struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { using UniqueByKeyPolicyT = cub::AgentUniqueByKeyPolicy; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; #endif // !TUNE_BASE diff --git a/cub/benchmarks/bench/transform/common.h b/cub/benchmarks/bench/transform/common.h index d8339645429..3f8ad71f590 100644 --- a/cub/benchmarks/bench/transform/common.h +++ b/cub/benchmarks/bench/transform/common.h @@ -31,7 +31,7 @@ using policy_hub_t = cub::detail::transform::policy_hub + struct max_policy : cub::ChainedPolicy<500, max_policy, max_policy> { static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__); static constexpr auto algorithm = static_cast(TUNE_ALGORITHM); diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index cd377a6d991..16353f392dc 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -383,40 +383,6 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv // Tuning policies //--------------------------------------------------------------------- - /// SM35 - struct Policy350 - { - using SpmvPolicyT = - AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 96 : 128, - (sizeof(ValueT) > 4) ? 4 : 7, - LOAD_LDG, - LOAD_CA, - LOAD_LDG, - LOAD_LDG, - LOAD_LDG, - (sizeof(ValueT) > 4) ? true : false, - BLOCK_SCAN_WARP_SCANS>; - - using SegmentFixupPolicyT = AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_WARP_SCANS>; - }; - - /// SM37 - struct Policy370 - { - using SpmvPolicyT = - AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 128 : 128, - (sizeof(ValueT) > 4) ? 
9 : 14, - LOAD_LDG, - LOAD_CA, - LOAD_LDG, - LOAD_LDG, - LOAD_LDG, - false, - BLOCK_SCAN_WARP_SCANS>; - - using SegmentFixupPolicyT = AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_WARP_SCANS>; - }; - /// SM50 struct Policy500 { @@ -459,15 +425,8 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv #if (CUB_PTX_ARCH >= 600) using PtxPolicy = Policy600; -#elif (CUB_PTX_ARCH >= 500) - using PtxPolicy = Policy500; - -#elif (CUB_PTX_ARCH >= 370) - using PtxPolicy = Policy370; - #else - using PtxPolicy = Policy350; - + using PtxPolicy = Policy500; #endif // "Opaque" policies (whose parameterizations aren't reflected in the type signature) @@ -502,12 +461,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv } else if (ptx_version >= 500) { spmv_config.template Init(); segment_fixup_config.template Init(); - } else if (ptx_version >= 370) { - spmv_config.template Init(); - segment_fixup_config.template Init(); } else { - spmv_config.template Init(); - segment_fixup_config.template Init(); + spmv_config.template Init(); + segment_fixup_config.template Init(); })); } diff --git a/cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh b/cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh index 20717e1c68a..b8d0a7557bd 100644 --- a/cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh @@ -52,17 +52,7 @@ struct policy_hub { using ValueT = typename std::iterator_traits::value_type; - struct Policy300 : ChainedPolicy<300, Policy300, Policy300> - { - using AdjacentDifferencePolicy = - AgentAdjacentDifferencePolicy<128, - Nominal8BItemsToItems(7), - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE>; - }; - - struct Policy350 : ChainedPolicy<350, Policy350, Policy300> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { using AdjacentDifferencePolicy = AgentAdjacentDifferencePolicy<128, @@ -72,7 +62,7 @@ struct policy_hub BLOCK_STORE_WARP_TRANSPOSE>; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; } // namespace adjacent_difference } // namespace detail diff --git a/cub/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh b/cub/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh index 86fe3374d89..d0ebefe0a1f 100644 --- a/cub/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh @@ -75,8 +75,8 @@ struct policy_hub using buff_delay_constructor_t = detail::default_delay_constructor_t; using block_delay_constructor_t = detail::default_delay_constructor_t; - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> + /// SM50 + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { static constexpr bool PREFER_POW2_BITS = true; using AgentSmallBufferPolicyT = AgentBatchMemcpyPolicy< @@ -95,7 +95,7 @@ struct policy_hub }; /// SM70 - struct Policy700 : ChainedPolicy<700, Policy700, Policy350> + struct Policy700 : ChainedPolicy<700, Policy700, Policy500> { static constexpr bool PREFER_POW2_BITS = false; using AgentSmallBufferPolicyT = AgentBatchMemcpyPolicy< diff --git a/cub/cub/device/dispatch/tuning/tuning_for.cuh b/cub/cub/device/dispatch/tuning/tuning_for.cuh index 759d7e632e5..d0ec964ca90 100644 --- a/cub/cub/device/dispatch/tuning/tuning_for.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_for.cuh @@ -49,12 +49,12 @@ namespace for_each struct policy_hub_t { - struct policy_350_t : ChainedPolicy<350, 
policy_350_t, policy_350_t> + struct policy_500_t : ChainedPolicy<500, policy_500_t, policy_500_t> { using for_policy_t = policy_t<256, 2>; }; - using MaxPolicy = policy_350_t; + using MaxPolicy = policy_500_t; }; } // namespace for_each diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index 1a06c25cb92..bd19489971e 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -136,15 +136,8 @@ struct policy_hub return (::cuda::std::max)(nominalItemsPerThread / NumActiveChannels / v_scale, 1); } - // SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> - { - // TODO This might be worth it to separate usual histogram and the multi one - using AgentHistogramPolicyT = AgentHistogramPolicy<128, t_scale(8), BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLEND, true>; - }; - // SM50 - struct Policy500 : ChainedPolicy<500, Policy500, Policy350> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { // TODO This might be worth it to separate usual histogram and the multi one using AgentHistogramPolicyT = diff --git a/cub/cub/device/dispatch/tuning/tuning_merge.cuh b/cub/cub/device/dispatch/tuning/tuning_merge.cuh index 0d69dd45b95..2521de6e9c3 100644 --- a/cub/cub/device/dispatch/tuning/tuning_merge.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_merge.cuh @@ -53,17 +53,7 @@ struct policy_hub using tune_type = char[has_values ? sizeof(KeyT) + sizeof(ValueT) : sizeof(KeyT)]; - struct policy300 : ChainedPolicy<300, policy300, policy300> - { - using merge_policy = - agent_policy_t<128, - Nominal4BItemsToItems(7), - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE>; - }; - - struct policy350 : ChainedPolicy<350, policy350, policy300> + struct policy500 : ChainedPolicy<500, policy500, policy500> { using merge_policy = agent_policy_t<256, @@ -73,7 +63,7 @@ struct policy_hub BLOCK_STORE_WARP_TRANSPOSE>; }; - struct policy520 : ChainedPolicy<520, policy520, policy350> + struct policy520 : ChainedPolicy<520, policy520, policy500> { using merge_policy = agent_policy_t<512, diff --git a/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh b/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh index 94d54b08509..29e98a3898a 100644 --- a/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh @@ -51,7 +51,7 @@ struct policy_hub { using KeyT = value_t; - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { using MergeSortPolicy = AgentMergeSortPolicy<256, @@ -63,9 +63,9 @@ struct policy_hub // NVBug 3384810 #if defined(_NVHPC_CUDA) - using Policy520 = Policy350; + using Policy520 = Policy500; #else - struct Policy520 : ChainedPolicy<520, Policy520, Policy350> + struct Policy520 : ChainedPolicy<520, Policy520, Policy500> { using MergeSortPolicy = AgentMergeSortPolicy<512, diff --git a/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh b/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh index 99b8dbda413..72c464ec5ea 100644 --- a/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh @@ -120,89 +120,8 @@ struct policy_hub // Architecture-specific tuning policies //------------------------------------------------------------------------------ - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> - { - enum - { - PRIMARY_RADIX_BITS = 
(sizeof(KeyT) > 1) ? 6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented - // keys/s (K40m) - ONESWEEP = false, - ONESWEEP_RADIX_BITS = 8, - }; - - // Histogram policy - using HistogramPolicy = AgentRadixSortHistogramPolicy<256, 8, 1, KeyT, ONESWEEP_RADIX_BITS>; - - // Exclusive sum policy - using ExclusiveSumPolicy = AgentRadixSortExclusiveSumPolicy<256, ONESWEEP_RADIX_BITS>; - - // Onesweep policy - using OnesweepPolicy = AgentRadixSortOnesweepPolicy< - 256, - 21, - DominantT, - 1, - RADIX_RANK_MATCH_EARLY_COUNTS_ANY, - BLOCK_SCAN_WARP_SCANS, - RADIX_SORT_STORE_DIRECT, - ONESWEEP_RADIX_BITS>; - - // Scan policy - using ScanPolicy = - AgentScanPolicy<1024, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS>; - - // Keys-only downsweep policies - using DownsweepPolicyKeys = AgentRadixSortDownsweepPolicy< - 128, - 9, - DominantT, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_LDG, - RADIX_RANK_MATCH, - BLOCK_SCAN_WARP_SCANS, - PRIMARY_RADIX_BITS>; - using AltDownsweepPolicyKeys = AgentRadixSortDownsweepPolicy< - 64, - 18, - DominantT, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - RADIX_RANK_MEMOIZE, - BLOCK_SCAN_WARP_SCANS, - PRIMARY_RADIX_BITS - 1>; - - // Key-value pairs downsweep policies - using DownsweepPolicyPairs = DownsweepPolicyKeys; - using AltDownsweepPolicyPairs = AgentRadixSortDownsweepPolicy< - 128, - 15, - DominantT, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - RADIX_RANK_MEMOIZE, - BLOCK_SCAN_WARP_SCANS, - PRIMARY_RADIX_BITS - 1>; - - // Downsweep policies - using DownsweepPolicy = ::cuda::std::_If; - - using AltDownsweepPolicy = ::cuda::std::_If; - - // Upsweep policies - using UpsweepPolicy = DownsweepPolicy; - using AltUpsweepPolicy = AltDownsweepPolicy; - - // Single-tile policy - using SingleTilePolicy = DownsweepPolicy; - - // Segmented policies - using SegmentedPolicy = DownsweepPolicy; - using AltSegmentedPolicy = AltDownsweepPolicy; - }; - /// SM50 - struct Policy500 : ChainedPolicy<500, Policy500, Policy350> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { enum { diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce.cuh index a87b6b9d6d6..d4719820752 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce.cuh @@ -79,26 +79,7 @@ CUB_RUNTIME_FUNCTION ReducePolicyWrapper MakeReducePolicyWrapper(Policy template struct policy_hub { - struct Policy300 : ChainedPolicy<300, Policy300, Policy300> - { - static constexpr int threads_per_block = 256; - static constexpr int items_per_thread = 20; - static constexpr int items_per_vec_load = 2; - - // ReducePolicy (GTX670: 154.0 @ 48M 4B items) - using ReducePolicy = - AgentReducePolicy; - - using SingleTilePolicy = ReducePolicy; - using SegmentedReducePolicy = ReducePolicy; - }; - - struct Policy350 : ChainedPolicy<350, Policy350, Policy300> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { static constexpr int threads_per_block = 256; static constexpr int items_per_thread = 20; @@ -117,7 +98,7 @@ struct policy_hub using SegmentedReducePolicy = ReducePolicy; }; - struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + struct Policy600 : ChainedPolicy<600, Policy600, Policy500> { static constexpr int threads_per_block = 256; static constexpr int items_per_thread = 16; diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index 41fbb2c49a4..a5ad19df8cc 100644 --- 
a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -633,9 +633,9 @@ struct policy_hub default_reduce_by_key_delay_constructor_t>; }; - struct Policy350 + struct Policy500 : DefaultPolicy - , ChainedPolicy<350, Policy350, Policy350> + , ChainedPolicy<500, Policy500, Policy500> {}; // Use values from tuning if a specialization exists, otherwise pick DefaultPolicy @@ -651,7 +651,7 @@ struct policy_hub template static auto select_agent_policy(long) -> typename DefaultPolicy::ReduceByKeyPolicyT; - struct Policy800 : ChainedPolicy<800, Policy800, Policy350> + struct Policy800 : ChainedPolicy<800, Policy800, Policy500> { using ReduceByKeyPolicyT = decltype(select_agent_policy()>>(0)); diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index 87631d1199e..d938209dcf2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -258,10 +258,10 @@ struct policy_hub default_reduce_by_key_delay_constructor_t>; }; - // SM35 - struct Policy350 + // SM50 + struct Policy500 : DefaultPolicy - , ChainedPolicy<350, Policy350, Policy350> + , ChainedPolicy<500, Policy500, Policy500> {}; // Use values from tuning if a specialization exists, otherwise pick the default @@ -277,7 +277,7 @@ struct policy_hub static auto select_agent_policy(long) -> typename DefaultPolicy::ReduceByKeyPolicyT; // SM80 - struct Policy800 : ChainedPolicy<800, Policy800, Policy350> + struct Policy800 : ChainedPolicy<800, Policy800, Policy500> { using ReduceByKeyPolicyT = decltype(select_agent_policy>(0)); }; @@ -451,10 +451,10 @@ struct policy_hub default_reduce_by_key_delay_constructor_t>; }; - // SM35 - struct Policy350 + // SM50 + struct Policy500 : DefaultPolicy // TODO(bgruber): I think we want `LengthT` instead of `int` - , ChainedPolicy<350, Policy350, Policy350> + , ChainedPolicy<500, Policy500, Policy500> {}; // Use values from tuning if a specialization exists, otherwise pick the default @@ -472,7 +472,7 @@ struct policy_hub typename DefaultPolicy::RleSweepPolicyT; // SM80 - struct Policy800 : ChainedPolicy<800, Policy800, Policy350> + struct Policy800 : ChainedPolicy<800, Policy800, Policy500> { using RleSweepPolicyT = decltype(select_agent_policy>(0)); }; diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index ae0d34ede32..7b076507341 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -273,13 +273,13 @@ struct policy_hub static constexpr BlockStoreAlgorithm scan_transposed_store = large_values ? 
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED : BLOCK_STORE_WARP_TRANSPOSE; - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T using ScanPolicyT = AgentScanPolicy<128, 12, AccumT, BLOCK_LOAD_DIRECT, LOAD_CA, BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, BLOCK_SCAN_RAKING>; }; - struct Policy520 : ChainedPolicy<520, Policy520, Policy350> + struct Policy520 : ChainedPolicy<520, Policy520, Policy500> { // Titan X: 32.47B items/s @ 48M 32-bit T using ScanPolicyT = diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh index cdd2468dc38..f8e29201eea 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh @@ -717,7 +717,7 @@ struct policy_hub static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(key_t), sizeof(AccumT))); static constexpr int combined_input_bytes = static_cast(sizeof(key_t) + sizeof(AccumT)); - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { static constexpr int nominal_4b_items_per_thread = 6; static constexpr int items_per_thread = @@ -752,7 +752,7 @@ struct policy_hub struct Policy520 : DefaultPolicy - , ChainedPolicy<520, Policy520, Policy350> + , ChainedPolicy<520, Policy520, Policy500> {}; // Use values from tuning if a specialization exists, otherwise pick the default diff --git a/cub/cub/device/dispatch/tuning/tuning_segmented_sort.cuh b/cub/cub/device/dispatch/tuning/tuning_segmented_sort.cuh index fc442a4f982..308949d0916 100644 --- a/cub/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_segmented_sort.cuh @@ -53,33 +53,7 @@ struct policy_hub using DominantT = ::cuda::std::_If<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>; static constexpr int KEYS_ONLY = ::cuda::std::is_same::value; - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> - { - static constexpr int BLOCK_THREADS = 128; - static constexpr int RADIX_BITS = sizeof(KeyT) > 1 ? 6 : 4; - static constexpr int PARTITIONING_THRESHOLD = 300; - - using LargeSegmentPolicy = AgentRadixSortDownsweepPolicy< - BLOCK_THREADS, - 9, - DominantT, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - RADIX_RANK_MATCH, - BLOCK_SCAN_WARP_SCANS, - RADIX_BITS>; - - static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(5); - static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(5); - using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< - BLOCK_THREADS, - // Small policy - AgentSubWarpMergeSortPolicy<4, ITEMS_PER_SMALL_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>, - // Medium policy - AgentSubWarpMergeSortPolicy<32, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>>; - }; - - struct Policy500 : ChainedPolicy<500, Policy500, Policy350> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { static constexpr int BLOCK_THREADS = 256; static constexpr int RADIX_BITS = sizeof(KeyT) > 1 ? 
6 : 4; diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index 792b1669fa1..10d22286068 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -566,12 +566,12 @@ struct policy_hub detail::fixed_delay_constructor_t<350, 450>>; }; - struct Policy350 + struct Policy500 : DefaultPolicy - , ChainedPolicy<350, Policy350, Policy350> + , ChainedPolicy<500, Policy500, Policy500> {}; - struct Policy800 : ChainedPolicy<800, Policy800, Policy350> + struct Policy800 : ChainedPolicy<800, Policy800, Policy500> { // Use values from tuning if a specialization exists, otherwise pick the default template diff --git a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh index 3645e4b9ed7..08364fe381d 100644 --- a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh @@ -239,9 +239,9 @@ struct policy_hub DelayConstructor>; }; - struct Policy350 + struct Policy500 : DefaultPolicy> - , ChainedPolicy<350, Policy350, Policy350> + , ChainedPolicy<500, Policy500, Policy500> {}; // Use values from tuning if a specialization exists, otherwise pick DefaultPolicy @@ -258,7 +258,7 @@ struct policy_hub static auto select_agent_policy(long) -> typename DefaultPolicy< default_delay_constructor_t::pack_t>>::ThreeWayPartitionPolicy; - struct Policy800 : ChainedPolicy<800, Policy800, Policy350> + struct Policy800 : ChainedPolicy<800, Policy800, Policy500> { using ThreeWayPartitionPolicy = decltype(select_agent_policy>(0)); }; diff --git a/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh index f988d6fb29e..0c6b717de2c 100644 --- a/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh @@ -538,9 +538,9 @@ struct policy_hub detail::default_delay_constructor_t>; }; - struct Policy350 + struct Policy500 : DefaultPolicy<9, 128> - , ChainedPolicy<350, Policy350, Policy350> + , ChainedPolicy<500, Policy500, Policy500> {}; // Use values from tuning if a specialization exists, otherwise pick the default @@ -557,7 +557,7 @@ struct policy_hub struct Policy520 : DefaultPolicy<11, 64> - , ChainedPolicy<520, Policy520, Policy350> + , ChainedPolicy<520, Policy520, Policy500> {}; struct Policy800 : ChainedPolicy<800, Policy800, Policy520> diff --git a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu index db655b73404..12f0467d12b 100644 --- a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu +++ b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu @@ -258,13 +258,13 @@ struct device_rle_policy_hub static constexpr int threads = 96; static constexpr int items = 15; - struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { using RleSweepPolicyT = cub:: AgentRlePolicy; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; struct CustomDeviceRunLengthEncode diff --git a/cub/test/catch2_test_util_device.cu b/cub/test/catch2_test_util_device.cu index 4c4d10a2ff1..da0eea09eba 100644 --- a/cub/test/catch2_test_util_device.cu +++ b/cub/test/catch2_test_util_device.cu @@ -125,9 +125,7 @@ C2H_TEST("PtxVersion returns a value from 
__CUDA_ARCH_LIST__/NV_TARGET_SM_INTEGE struct policy_hub_all { // for the list of supported architectures, see libcudacxx/include/nv/target - GEN_POLICY(350, 350); - GEN_POLICY(370, 350); - GEN_POLICY(500, 370); + GEN_POLICY(500, 500); GEN_POLICY(520, 500); GEN_POLICY(530, 520); GEN_POLICY(600, 530); @@ -258,8 +256,7 @@ DECLARE_TMPL_LAUNCH_WRAPPER(check_chained_policy_selects_correct_policy, struct policy_hub_some { - GEN_POLICY(350, 350); - GEN_POLICY(500, 350); + GEN_POLICY(500, 500); GEN_POLICY(700, 500); GEN_POLICY(900, 700); GEN_POLICY(2000, 900); // non-existing architecture, just to test @@ -268,30 +265,30 @@ struct policy_hub_some struct policy_hub_few { - GEN_POLICY(350, 350); - GEN_POLICY(860, 350); + GEN_POLICY(500, 500); + GEN_POLICY(860, 500); GEN_POLICY(2000, 860); // non-existing architecture, just to test using max_policy = policy2000; }; struct policy_hub_minimal { - GEN_POLICY(350, 350); - using max_policy = policy350; + GEN_POLICY(500, 500); + using max_policy = policy500; }; C2H_TEST("ChainedPolicy invokes correct policy", "[util][dispatch]") { SECTION("policy_hub_some") { - check_wrapper_some(::cuda::std::array{350, 500, 700, 900, 2000}); + check_wrapper_some(::cuda::std::array{500, 700, 900, 2000}); } SECTION("policy_hub_few") { - check_wrapper_some(::cuda::std::array{350, 860, 2000}); + check_wrapper_some(::cuda::std::array{500, 860, 2000}); } SECTION("policy_hub_minimal") { - check_wrapper_some(::cuda::std::array{350}); + check_wrapper_some(::cuda::std::array{500}); } } diff --git a/cub/test/catch2_test_vsmem.cu b/cub/test/catch2_test_vsmem.cu index 6b16bde7fa9..557f2c152d0 100644 --- a/cub/test/catch2_test_vsmem.cu +++ b/cub/test/catch2_test_vsmem.cu @@ -198,7 +198,7 @@ struct device_dummy_algorithm_policy_t static constexpr int FALLBACK_BLOCK_THREADS = 64; - struct policy_350 : cub::ChainedPolicy<350, policy_350, policy_350> + struct policy_500 : cub::ChainedPolicy<500, policy_500, policy_500> { using DummyAlgorithmPolicy = agent_dummy_algorithm_policy_t<256, cub::Nominal4BItemsToItems(17)>; @@ -208,7 +208,7 @@ struct device_dummy_algorithm_policy_t }; /// MaxPolicy - using max_policy_t = policy_350; + using max_policy_t = policy_500; }; //---------------------------------------------------------------------------- @@ -422,9 +422,9 @@ C2H_TEST("Virtual shared memory works within algorithms", "[util][vsmem]", type_ c2h::gen(C2H_SEED(1), in); // Query default and fallback policies and agents so we can confirm vsmem - using default_policy_t = typename device_dummy_algorithm_policy_t::policy_350::DummyAlgorithmPolicy; + using default_policy_t = typename device_dummy_algorithm_policy_t::policy_500::DummyAlgorithmPolicy; using default_agent_t = agent_dummy_algorithm_t; - using fallback_policy_t = typename device_dummy_algorithm_policy_t::policy_350::FallbackDummyAlgorithmPolicy; + using fallback_policy_t = typename device_dummy_algorithm_policy_t::policy_500::FallbackDummyAlgorithmPolicy; using fallback_agent_t = agent_dummy_algorithm_t; // Get the information as it is expected from the vsmem helper to work as expected diff --git a/docs/cub/developer_overview.rst b/docs/cub/developer_overview.rst index 4cc639e27fb..8b31dab6283 100644 --- a/docs/cub/developer_overview.rst +++ b/docs/cub/developer_overview.rst @@ -625,14 +625,14 @@ Finally, the tuning policy hub looks like: struct policy_hub { // TuningRelevantParams... could be used for decision making, like element types used, iterator category, etc. 
- // for SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy300> { + // for SM50 + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { using AlgorithmPolicy = AgentAlgorithmPolicy<256, 20, BLOCK_LOAD_DIRECT, LOAD_LDG>; // ... additional policies may exist, often one per agent }; // for SM60 - struct Policy600 : ChainedPolicy<600, Policy600, Policy350> { + struct Policy600 : ChainedPolicy<600, Policy600, Policy500> { using AlgorithmPolicy = AgentAlgorithmPolicy<256, 16, BLOCK_LOAD_DIRECT, LOAD_LDG>; }; diff --git a/docs/repo.toml b/docs/repo.toml index 999d62a8f20..7ff29fd6eba 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -115,7 +115,7 @@ doxygen_aliases = [ "smemstorage{1}=The operations exposed by \\1 require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the ``__shared__`` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or ``union``'d with other storage allocation types to facilitate memory reuse.", "granularity=Efficiency is increased with increased granularity ``ITEMS_PER_THREAD``. Performance is also typically increased until the additional register pressure or shared memory allocation size causes SM occupancy to fall too low. Consider variants of ``cub::BlockLoad`` for efficiently gathering a :ref:`blocked arrangement ` of elements across threads.", "blocksize=The number of threads in the block is a multiple of the architecture's warp size", - "ptxversion=The PTX compute capability for which to to specialize this collective, formatted as per the ``__CUDA_ARCH__`` macro (e.g., 350 for sm_35). Useful for determining the collective's storage requirements for a given device from the host. (Default: the value of ``__CUDA_ARCH__`` during the current compiler pass)", + "ptxversion=The PTX compute capability for which to to specialize this collective, formatted as per the ``__CUDA_ARCH__`` macro (e.g., 750 for sm_75). Useful for determining the collective's storage requirements for a given device from the host. 
(Default: the value of ``__CUDA_ARCH__`` during the current compiler pass)", "blockcollective{1}=Every thread in the block uses the \\1 class by first specializing the \\1 type, then instantiating an instance with parameters for communication, and finally invoking one or more collective member functions.", "warpcollective{1}=Every thread in the warp uses the \\1 class by first specializing the \\1 type, then instantiating an instance with parameters for communication, and finally invoking or more collective member functions.", "devicestorage=When ``d_temp_storage`` is ``nullptr``, no work is done and the required allocation size is returned in ``temp_storage_bytes``.", diff --git a/libcudacxx/test/libcudacxx/cuda/test_platform.pass.cpp b/libcudacxx/test/libcudacxx/cuda/test_platform.pass.cpp index 499c57a9a7e..25181f48a33 100644 --- a/libcudacxx/test/libcudacxx/cuda/test_platform.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/test_platform.pass.cpp @@ -65,10 +65,6 @@ __host__ __device__ void test() (static_assert(arch_val >= 520, "cuda arch expected 520");), NV_PROVIDES_SM_50, (static_assert(arch_val >= 500, "cuda arch expected 500");), - NV_PROVIDES_SM_37, - (static_assert(arch_val >= 370, "cuda arch expected 370");), - NV_PROVIDES_SM_35, - (static_assert(arch_val >= 350, "cuda arch expected 350");), NV_IS_HOST, (static_assert(arch_val == 0, "cuda arch expected 0");)) @@ -98,10 +94,6 @@ __host__ __device__ void test() (static_assert(arch_val == 520, "cuda arch expected 520");), NV_IS_EXACTLY_SM_50, (static_assert(arch_val == 500, "cuda arch expected 500");), - NV_IS_EXACTLY_SM_37, - (static_assert(arch_val == 370, "cuda arch expected 370");), - NV_IS_EXACTLY_SM_35, - (static_assert(arch_val == 350, "cuda arch expected 350");), NV_IS_HOST, (static_assert(arch_val == 0, "cuda arch expected 0");)) @@ -158,10 +150,6 @@ __host__ __device__ void test() (invoke_count += 1; invoke_count += threadIdx.x;), NV_PROVIDES_SM_50, (invoke_count += 1; invoke_count += threadIdx.x;), - NV_PROVIDES_SM_37, - (invoke_count += 1; invoke_count += threadIdx.x;), - NV_PROVIDES_SM_35, - (invoke_count += 1; invoke_count += threadIdx.x;), NV_IS_HOST, (invoke_count += 1;)) @@ -188,10 +176,6 @@ __host__ __device__ void test() (invoke_count += 1; invoke_count += threadIdx.x;), NV_IS_EXACTLY_SM_50, (invoke_count += 1; invoke_count += threadIdx.x;), - NV_IS_EXACTLY_SM_37, - (invoke_count += 1; invoke_count += threadIdx.x;), - NV_IS_EXACTLY_SM_35, - (invoke_count += 1; invoke_count += threadIdx.x;), NV_IS_HOST, (invoke_count += 1;)) @@ -252,10 +236,6 @@ void test() (static_assert(arch_val == 520, "cuda arch expected 520");), NV_PROVIDES_SM_50, (static_assert(arch_val == 500, "cuda arch expected 500");), - NV_PROVIDES_SM_37, - (static_assert(arch_val == 370, "cuda arch expected 370");), - NV_PROVIDES_SM_35, - (static_assert(arch_val == 350, "cuda arch expected 350");), NV_IS_HOST, (static_assert(arch_val == 0, "cuda arch expected 0");)) @@ -281,10 +261,6 @@ void test() (static_assert(arch_val == 520, "cuda arch expected 520");), NV_IS_EXACTLY_SM_50, (static_assert(arch_val == 500, "cuda arch expected 500");), - NV_IS_EXACTLY_SM_37, - (static_assert(arch_val == 370, "cuda arch expected 370");), - NV_IS_EXACTLY_SM_35, - (static_assert(arch_val == 350, "cuda arch expected 350");), NV_IS_HOST, (static_assert(arch_val == 0, "cuda arch expected 0");)) diff --git a/libcudacxx/test/support/concurrent_agents.h b/libcudacxx/test/support/concurrent_agents.h index 6b57b3531a0..6419613a5d8 100644 --- 
a/libcudacxx/test/support/concurrent_agents.h +++ b/libcudacxx/test/support/concurrent_agents.h @@ -13,10 +13,6 @@ #ifndef __CUDA_ARCH__ # include -#else -# if __CUDA_ARCH__ < 350 -# error "This test requires CUDA dynamic parallelism to work." -# endif #endif #include diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h index a3c8994d777..94a7e750aeb 100644 --- a/thrust/thrust/system/cuda/detail/core/util.h +++ b/thrust/thrust/system/cuda/detail/core/util.h @@ -64,32 +64,17 @@ namespace core # if (__NVCOMPILER_CUDA_ARCH__ >= 600) // deprecated [since 2.8] # define THRUST_TUNING_ARCH sm60 -# elif (__NVCOMPILER_CUDA_ARCH__ >= 520) -// deprecated [since 2.8] -# define THRUST_TUNING_ARCH sm52 -# elif (__NVCOMPILER_CUDA_ARCH__ >= 350) -// deprecated [since 2.8] -# define THRUST_TUNING_ARCH sm35 # else // deprecated [since 2.8] -# define THRUST_TUNING_ARCH sm30 +# define THRUST_TUNING_ARCH sm52 # endif #else # if (__CUDA_ARCH__ >= 600) // deprecated [since 2.8] # define THRUST_TUNING_ARCH sm60 -# elif (__CUDA_ARCH__ >= 520) +# else // deprecated [since 2.8] # define THRUST_TUNING_ARCH sm52 -# elif (__CUDA_ARCH__ >= 350) -// deprecated [since 2.8] -# define THRUST_TUNING_ARCH sm35 -# elif (__CUDA_ARCH__ >= 300) -// deprecated [since 2.8] -# define THRUST_TUNING_ARCH sm30 -# elif !defined(__CUDA_ARCH__) -// deprecated [since 2.8] -# define THRUST_TUNING_ARCH sm30 # endif #endif @@ -101,22 +86,7 @@ struct typelist; // supported SM arch // --------------------- -struct sm30 -{ - enum - { - ver = 300, - warpSize = 32 - }; -}; -struct sm35 -{ - enum - { - ver = 350, - warpSize = 32 - }; -}; + struct sm52 { enum @@ -137,7 +107,7 @@ struct sm60 // list of sm, checked from left to right order // the rightmost is the lowest sm arch supported // -------------------------------------------- -using sm_list = typelist; +using sm_list = typelist; // lowest supported SM arch // -------------------------------------------------------------------------- @@ -784,8 +754,6 @@ THRUST_RUNTIME_FUNCTION cudaError_t alias_storage( } } // namespace core -using core::sm30; -using core::sm35; using core::sm52; using core::sm60; } // namespace cuda_cub diff --git a/thrust/thrust/system/cuda/detail/reduce.h b/thrust/thrust/system/cuda/detail/reduce.h index 443063fb3b4..3787ab62367 100644 --- a/thrust/thrust/system/cuda/detail/reduce.h +++ b/thrust/thrust/system/cuda/detail/reduce.h @@ -109,7 +109,7 @@ template struct Tuning; template -struct Tuning +struct Tuning { enum { @@ -119,18 +119,6 @@ struct Tuning SCALE_FACTOR_1B = sizeof(T), }; - using type = - PtxPolicy<256, - (((20 / SCALE_FACTOR_4B) > (1)) ? 
(20 / SCALE_FACTOR_4B) : (1)), - 2, - cub::BLOCK_REDUCE_WARP_REDUCTIONS, - cub::LOAD_DEFAULT, - cub::GRID_MAPPING_RAKE>; -}; // Tuning sm30 - -template -struct Tuning : Tuning -{ // ReducePolicy1B (GTX Titan: 228.7 GB/s @ 192M 1B items) using ReducePolicy1B = PtxPolicy<128, @@ -150,7 +138,7 @@ struct Tuning : Tuning cub::GRID_MAPPING_DYNAMIC>; using type = ::cuda::std::conditional_t<(sizeof(T) < 4), ReducePolicy1B, ReducePolicy4B>; -}; // Tuning sm35 +}; // Tuning sm52 template struct ReduceAgent diff --git a/thrust/thrust/system/cuda/detail/reduce_by_key.h b/thrust/thrust/system/cuda/detail/reduce_by_key.h index cc59c98ab2c..ae1f0ffab96 100644 --- a/thrust/thrust/system/cuda/detail/reduce_by_key.h +++ b/thrust/thrust/system/cuda/detail/reduce_by_key.h @@ -115,54 +115,7 @@ template struct Tuning; template -struct Tuning -{ - enum - { - MAX_INPUT_BYTES = mpl::max::value, - COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value), - - NOMINAL_4B_ITEMS_PER_THREAD = 6, - - ITEMS_PER_THREAD = - mpl::min(((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) - / COMBINED_INPUT_BYTES)>::value>::value, - }; - - using type = - PtxPolicy<128, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_DEFAULT, cub::BLOCK_SCAN_WARP_SCANS>; -}; // Tuning sm30 - -template -struct Tuning : Tuning -{ - enum - { - MAX_INPUT_BYTES = mpl::max::value, - COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value), - - NOMINAL_4B_ITEMS_PER_THREAD = 6, - - ITEMS_PER_THREAD = - (MAX_INPUT_BYTES <= 8) - ? 6 - : mpl::min< - int, - NOMINAL_4B_ITEMS_PER_THREAD, - mpl::max:: - value>::value, - }; - - using type = - PtxPolicy<128, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_LDG, cub::BLOCK_SCAN_WARP_SCANS>; -}; // Tuning sm35 - -template -struct Tuning : Tuning +struct Tuning { enum { diff --git a/thrust/thrust/system/cuda/detail/set_operations.h b/thrust/thrust/system/cuda/detail/set_operations.h index 0ef80c0fb2d..7a267080bf8 100644 --- a/thrust/thrust/system/cuda/detail/set_operations.h +++ b/thrust/thrust/system/cuda/detail/set_operations.h @@ -221,27 +221,6 @@ struct Tuning; namespace mpl = thrust::detail::mpl::math; -template -struct Tuning -{ - enum - { - MAX_INPUT_BYTES = mpl::max::value, - COMBINED_INPUT_BYTES = sizeof(T), // + sizeof(Value), - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = - mpl::min(((NOMINAL_4B_ITEMS_PER_THREAD * 4) + COMBINED_INPUT_BYTES - 1) - / COMBINED_INPUT_BYTES)>::value>::value, - }; - - using type = - PtxPolicy<128, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_DEFAULT, cub::BLOCK_SCAN_WARP_SCANS>; -}; // tuning sm30 - template struct Tuning { diff --git a/thrust/thrust/system/cuda/detail/unique.h b/thrust/thrust/system/cuda/detail/unique.h index b8e408254cb..ac94017758b 100644 --- a/thrust/thrust/system/cuda/detail/unique.h +++ b/thrust/thrust/system/cuda/detail/unique.h @@ -137,36 +137,6 @@ struct Tuning PtxPolicy<64, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_LDG, cub::BLOCK_SCAN_WARP_SCANS>; }; // Tuning for sm52 -template -struct Tuning -{ - const static int INPUT_SIZE = sizeof(T); - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - // - ITEMS_PER_THREAD = items_per_thread::value - }; - - using type = - PtxPolicy<128, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_LDG, cub::BLOCK_SCAN_WARP_SCANS>; -}; // Tuning for sm35 - -template -struct Tuning -{ - const static int INPUT_SIZE = sizeof(T); - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - // - ITEMS_PER_THREAD = items_per_thread::value - }; - - using type = - 
PtxPolicy<128, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_DEFAULT, cub::BLOCK_SCAN_WARP_SCANS>; -}; // Tuning for sm30 - template struct UniqueAgent { From d21e0c9804ad63d23950c8b0a2462e5b7ebc8701 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 29 Jan 2025 22:52:12 +0100 Subject: [PATCH 09/33] PTX: Update generated files with Blackwell instructions (#3568) * ptx: Update existing instructions * ptx: Add new instructions * Fix returning error out values See: - https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/74 - https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/73 * ptx: Fix out var declaration See https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/75 * mbarrier.{test,try}_wait: Fix test. Wrong files were included. * docs: Fix special registers include * Allow non-included documentation pages * Workaround NVRTC Co-authored-by: Allard Hendriksen --- .../generated/barrier_cluster_aligned.rst | 63 + .../generated/clusterlaunchcontrol.rst | 68 + .../instructions/generated/cp_async_bulk.rst | 38 +- .../generated/cp_async_bulk_multicast.rst | 2 +- .../generated/cp_async_bulk_tensor.rst | 280 +- .../cp_async_bulk_tensor_gather_scatter.rst | 124 + .../cp_async_bulk_tensor_multicast.rst | 200 +- .../generated/cp_async_mbarrier_arrive.rst | 11 + .../cp_async_mbarrier_arrive_noinc.rst | 11 + .../generated/cp_reduce_async_bulk.rst | 6 +- .../ptx/instructions/generated/elect_sync.rst | 11 + .../ptx/instructions/generated/fence.rst | 170 +- .../generated/fence_proxy_async.rst | 6 +- ...ence_proxy_async_generic_sync_restrict.rst | 30 + .../generated/fence_sync_restrict.rst | 30 + .../{special_registers.rst => get_sreg.rst} | 99 +- .../ptx/instructions/generated/getctarank.rst | 2 +- .../ptx/instructions/generated/mapa.rst | 14 + .../generated/mbarrier_arrive.rst | 105 +- .../generated/mbarrier_arrive_expect_tx.rst | 54 +- .../generated/mbarrier_test_wait.rst | 34 +- .../generated/mbarrier_test_wait_parity.rst | 34 +- .../generated/mbarrier_try_wait.rst | 70 +- .../generated/mbarrier_try_wait_parity.rst | 70 +- .../generated/multimem_ld_reduce.rst | 2396 ++++++ .../instructions/generated/multimem_red.rst | 2306 ++++++ .../instructions/generated/multimem_st.rst | 250 + .../ptx/instructions/generated/red_async.rst | 32 +- .../ptx/instructions/generated/st_async.rst | 10 +- .../ptx/instructions/generated/st_bulk.rst | 13 + .../instructions/generated/tcgen05_alloc.rst | 70 + .../instructions/generated/tcgen05_commit.rst | 48 + .../ptx/instructions/generated/tcgen05_cp.rst | 434 ++ .../instructions/generated/tcgen05_fence.rst | 18 + .../ptx/instructions/generated/tcgen05_ld.rst | 758 ++ .../instructions/generated/tcgen05_mma.rst | 2378 ++++++ .../instructions/generated/tcgen05_mma_ws.rst | 4482 ++++++++++++ .../instructions/generated/tcgen05_shift.rst | 24 + .../ptx/instructions/generated/tcgen05_st.rst | 758 ++ .../instructions/generated/tcgen05_wait.rst | 18 + .../generated/tensormap_replace.rst | 114 +- .../ptx/instructions/special_registers.rst | 2 +- docs/repo.toml | 2 +- .../instructions/generated/barrier_cluster.h | 66 +- .../generated/barrier_cluster_aligned.h | 130 + .../generated/clusterlaunchcontrol.h | 240 + .../instructions/generated/cp_async_bulk.h | 153 +- .../generated/cp_async_bulk_commit_group.h | 12 +- .../generated/cp_async_bulk_multicast.h | 35 +- .../generated/cp_async_bulk_tensor.h | 849 ++- .../cp_async_bulk_tensor_gather_scatter.h | 288 + .../cp_async_bulk_tensor_multicast.h | 515 +- 
.../generated/cp_async_bulk_wait_group.h | 24 +- .../generated/cp_async_mbarrier_arrive.h | 26 + .../cp_async_mbarrier_arrive_noinc.h | 26 + .../generated/cp_reduce_async_bulk.h | 944 +-- .../generated/cp_reduce_async_bulk_bf16.h | 78 +- .../generated/cp_reduce_async_bulk_f16.h | 78 +- .../generated/cp_reduce_async_bulk_tensor.h | 788 +- .../__ptx/instructions/generated/elect_sync.h | 36 + .../cuda/__ptx/instructions/generated/fence.h | 224 +- .../generated/fence_mbarrier_init.h | 16 +- .../generated/fence_proxy_alias.h | 12 +- .../generated/fence_proxy_async.h | 44 +- .../fence_proxy_async_generic_sync_restrict.h | 62 + .../generated/fence_proxy_tensormap_generic.h | 96 +- .../generated/fence_sync_restrict.h | 62 + .../__ptx/instructions/generated/get_sreg.h | 506 +- .../__ptx/instructions/generated/getctarank.h | 21 +- .../cuda/__ptx/instructions/generated/mapa.h | 33 + .../instructions/generated/mbarrier_arrive.h | 338 +- .../generated/mbarrier_arrive_expect_tx.h | 153 +- .../generated/mbarrier_arrive_no_complete.h | 22 +- .../generated/mbarrier_expect_tx.h | 94 + .../instructions/generated/mbarrier_init.h | 12 +- .../generated/mbarrier_test_wait.h | 133 +- .../generated/mbarrier_test_wait_parity.h | 132 +- .../generated/mbarrier_try_wait.h | 278 +- .../generated/mbarrier_try_wait_parity.h | 278 +- .../generated/multimem_ld_reduce.h | 2148 ++++++ .../instructions/generated/multimem_red.h | 1272 ++++ .../instructions/generated/multimem_st.h | 186 + .../__ptx/instructions/generated/red_async.h | 335 +- .../__ptx/instructions/generated/st_async.h | 118 +- .../__ptx/instructions/generated/st_bulk.h | 31 + .../instructions/generated/tcgen05_alloc.h | 105 + .../instructions/generated/tcgen05_commit.h | 81 + .../__ptx/instructions/generated/tcgen05_cp.h | 612 ++ .../instructions/generated/tcgen05_fence.h | 44 + .../__ptx/instructions/generated/tcgen05_ld.h | 4446 ++++++++++++ .../instructions/generated/tcgen05_mma.h | 3842 ++++++++++ .../instructions/generated/tcgen05_mma_ws.h | 6438 +++++++++++++++++ .../instructions/generated/tcgen05_shift.h | 36 + .../__ptx/instructions/generated/tcgen05_st.h | 4554 ++++++++++++ .../instructions/generated/tcgen05_wait.h | 44 + .../generated/tensormap_cp_fenceproxy.h | 68 +- .../generated/tensormap_replace.h | 630 +- .../ptx/generated/barrier_cluster_aligned.h | 61 + .../cuda/ptx/generated/clusterlaunchcontrol.h | 84 + .../cuda/ptx/generated/cp_async_bulk.h | 29 +- .../ptx/generated/cp_async_bulk_multicast.h | 28 +- .../cuda/ptx/generated/cp_async_bulk_tensor.h | 325 +- .../cp_async_bulk_tensor_gather_scatter.h | 180 + .../cp_async_bulk_tensor_multicast.h | 405 +- .../ptx/generated/cp_async_mbarrier_arrive.h | 26 + .../cp_async_mbarrier_arrive_noinc.h | 26 + .../cuda/ptx/generated/elect_sync.h | 26 + .../libcudacxx/cuda/ptx/generated/fence.h | 64 +- .../fence_proxy_async_generic_sync_restrict.h | 38 + .../cuda/ptx/generated/fence_sync_restrict.h | 38 + .../test/libcudacxx/cuda/ptx/generated/mapa.h | 27 + .../cuda/ptx/generated/mbarrier_arrive.h | 56 + .../ptx/generated/mbarrier_arrive_expect_tx.h | 29 + .../cuda/ptx/generated/mbarrier_expect_tx.h | 50 + .../cuda/ptx/generated/mbarrier_test_wait.h | 55 + .../ptx/generated/mbarrier_test_wait_parity.h | 55 + .../cuda/ptx/generated/mbarrier_try_wait.h | 31 + .../ptx/generated/mbarrier_try_wait_parity.h | 32 + .../cuda/ptx/generated/mbarrier_wait.h | 24 - .../cuda/ptx/generated/mbarrier_wait_parity.h | 24 - .../cuda/ptx/generated/multimem_ld_reduce.h | 1020 +++ .../cuda/ptx/generated/multimem_red.h | 840 +++ 
.../cuda/ptx/generated/multimem_st.h | 110 + .../libcudacxx/cuda/ptx/generated/st_bulk.h | 26 + .../cuda/ptx/generated/tcgen05_alloc.h | 81 + .../cuda/ptx/generated/tcgen05_commit.h | 62 + .../cuda/ptx/generated/tcgen05_cp.h | 396 + .../cuda/ptx/generated/tcgen05_fence.h | 44 + .../cuda/ptx/generated/tcgen05_ld.h | 1012 +++ .../cuda/ptx/generated/tcgen05_mma.h | 2928 ++++++++ .../cuda/ptx/generated/tcgen05_mma_ws.h | 3570 +++++++++ .../cuda/ptx/generated/tcgen05_shift.h | 39 + .../cuda/ptx/generated/tcgen05_st.h | 1012 +++ .../cuda/ptx/generated/tcgen05_wait.h | 40 + .../cuda/ptx/generated/tensormap_replace.h | 390 +- .../libcudacxx/cuda/ptx/nvrtc_workaround.h | 34 + .../ptx/ptx.barrier.cluster.compile.pass.cpp | 2 + ...p.async.bulk.commit_group.compile.pass.cpp | 2 + .../ptx/ptx.cp.async.bulk.compile.pass.cpp | 2 + ...x.cp.async.bulk.multicast.compile.pass.cpp | 2 + .../ptx.cp.async.bulk.tensor.compile.pass.cpp | 2 + ...ync.bulk.tensor.multicast.compile.pass.cpp | 2 + ....cp.async.bulk.wait_group.compile.pass.cpp | 2 + .../ptx.cp.reduce.async.bulk.compile.pass.cpp | 2 + ....reduce.async.bulk.tensor.compile.pass.cpp | 2 + .../cuda/ptx/ptx.fence.compile.pass.cpp | 2 + .../cuda/ptx/ptx.get_sreg.compile.pass.cpp | 2 + .../cuda/ptx/ptx.getctarank.compile.pass.cpp | 2 + .../ptx/ptx.mbarrier.arrive.compile.pass.cpp | 2 + .../ptx/ptx.mbarrier.init.compile.pass.cpp | 2 + .../ptx/ptx.mbarrier.wait.compile.pass.cpp | 6 +- .../cuda/ptx/ptx.red.async.compile.pass.cpp | 2 + .../cuda/ptx/ptx.st.async.compile.pass.cpp | 2 + ...x.tensormap.cp_fenceproxy.compile.pass.cpp | 2 + .../ptx.tensormap.replace.compile.pass.cpp | 2 + 155 files changed, 58115 insertions(+), 2683 deletions(-) create mode 100644 docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/clusterlaunchcontrol.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/elect_sync.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence_sync_restrict.rst rename docs/libcudacxx/ptx/instructions/generated/{special_registers.rst => get_sreg.rst} (83%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/mapa.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/multimem_ld_reduce.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/multimem_red.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/multimem_st.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/st_bulk.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_alloc.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_commit.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_cp.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_fence.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_ld.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_mma.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_mma_ws.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_shift.rst create 
mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_st.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_wait.rst create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/multimem_red.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/multimem_st.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/st_bulk.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_commit.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_cp.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_fence.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_ld.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_shift.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_st.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_wait.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/clusterlaunchcontrol.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive_noinc.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async_generic_sync_restrict.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_sync_restrict.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_expect_tx.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait_parity.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h delete mode 100644 
libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_ld_reduce.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_red.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_st.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/st_bulk.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_alloc.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_commit.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_cp.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_fence.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_ld.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma_ws.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_shift.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_st.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_wait.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst new file mode 100644 index 00000000000..a24093ac7b6 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst @@ -0,0 +1,63 @@ +.. + This file was automatically generated. Do not edit. + +barrier.cluster.arrive.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90 + // .aligned = { .aligned } + // Marked volatile and as clobbering memory + template + __device__ static inline void barrier_cluster_arrive( + cuda::ptx::dot_aligned_t); + +barrier.cluster.wait.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // barrier.cluster.wait.aligned; // PTX ISA 78, SM_90 + // .aligned = { .aligned } + // Marked volatile and as clobbering memory + template + __device__ static inline void barrier_cluster_wait( + cuda::ptx::dot_aligned_t); + +barrier.cluster.arrive.release.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 + // .sem = { .release } + // .aligned = { .aligned } + // Marked volatile and as clobbering memory + template + __device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::dot_aligned_t); + +barrier.cluster.arrive.relaxed.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 + // .sem = { .relaxed } + // .aligned = { .aligned } + // Marked volatile + template + __device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_relaxed_t, + cuda::ptx::dot_aligned_t); + +barrier.cluster.wait.acquire.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90 + // .sem = { .acquire } + // .aligned = { .aligned } + // Marked volatile and as clobbering memory + template + __device__ static inline void barrier_cluster_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::dot_aligned_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/clusterlaunchcontrol.rst b/docs/libcudacxx/ptx/instructions/generated/clusterlaunchcontrol.rst new file mode 100644 index 00000000000..b372c5bf33e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/clusterlaunchcontrol.rst @@ -0,0 +1,68 @@ +.. + This file was automatically generated. Do not edit. + +clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [addr], [smem_bar]; // PTX ISA 86, SM_100 + template + __device__ static inline void clusterlaunchcontrol_try_cancel( + void* addr, + uint64_t* smem_bar); + +clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 [addr], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void clusterlaunchcontrol_try_cancel_multicast( + void* addr, + uint64_t* smem_bar); + +clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 pred_is_canceled, try_cancel_response; // PTX ISA 86, SM_100 + template = true> + __device__ static inline bool clusterlaunchcontrol_query_cancel_is_canceled( + B128 try_cancel_response); + +clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 + template = true, typename B128, enable_if_t = true> + __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_x( + B128 try_cancel_response); + +clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 + template = true, typename B128, enable_if_t = true> + __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_y( + B128 try_cancel_response); + +clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 + template = true, typename B128, enable_if_t = true> + __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_z( + B128 try_cancel_response); + +clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 block_dim, try_cancel_response; // PTX ISA 86, SM_100 + template = true, typename B128, enable_if_t = true> + __device__ static inline void clusterlaunchcontrol_query_cancel_get_first_ctaid( + B32 (&block_dim)[4], + B128 try_cancel_response); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst index 4883d8495eb..2bb334f1971 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst @@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90 + // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -17,11 +17,27 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes const uint32_t& size, uint64_t* smem_bar); +cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar); + cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 + // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } template @@ -37,7 +53,7 @@ cp.async.bulk.global.shared::cta.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 + // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -47,3 +63,19 @@ cp.async.bulk.global.shared::cta.bulk_group void* dstMem, const void* srcMem, const uint32_t& size); + +cp.async.bulk.global.shared::cta.bulk_group.cp_mask +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.dst.src.bulk_group.cp_mask [dstMem], [srcMem], size, byteMask; // PTX ISA 86, SM_100 + // .dst = { .global } + // .src = { .shared::cta } + template + __device__ static inline void cp_async_bulk_cp_mask( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + const uint16_t& byteMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst index af027c0b623..396a04e468b 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst @@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::clu ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a + // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst index 1c21efdd0a3..9d44a10800b 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst @@ -5,7 +5,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -17,11 +17,63 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[1], uint64_t* smem_bar); +cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); + +cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); + +cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); + cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -36,7 +88,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -48,11 +100,63 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[2], uint64_t* smem_bar); +cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); + cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -67,7 +171,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -79,11 +183,63 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[3], uint64_t* smem_bar); +cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); + +cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); + +cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); + cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -98,7 +254,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -110,11 +266,63 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[4], uint64_t* smem_bar); +cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); + +cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); + +cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); + cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -129,7 +337,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -141,11 +349,63 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[5], uint64_t* smem_bar); +cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst new file mode 100644 index 00000000000..971f0213cb0 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst @@ -0,0 +1,124 @@ +.. + This file was automatically generated. Do not edit. + +cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .global } + // .src = { .shared::cta } + template + __device__ static inline void cp_async_bulk_tensor_tile_scatter4( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + const void* srcMem); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst index ac33a05b69f..8ea38a2e0ad 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst @@ -5,7 +5,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -18,11 +18,49 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes uint64_t* smem_bar, const uint16_t& ctaMask); +cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); + cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -35,11 +73,49 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes uint64_t* smem_bar, const uint16_t& ctaMask); +cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); + cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. 
PTX ISA 80, SM_90a + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -52,11 +128,49 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes uint64_t* smem_bar, const uint16_t& ctaMask); +cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); + cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -69,11 +183,49 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes uint64_t* smem_bar, const uint16_t& ctaMask); +cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); + cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -85,3 +237,41 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[5], uint64_t* smem_bar, const uint16_t& ctaMask); + +cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive.rst new file mode 100644 index 00000000000..73ce222a9ec --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive.rst @@ -0,0 +1,11 @@ +.. + This file was automatically generated. Do not edit. + +cp.async.mbarrier.arrive.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.mbarrier.arrive.b64 [addr]; // PTX ISA 70, SM_80 + template + __device__ static inline void cp_async_mbarrier_arrive( + uint64_t* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.rst new file mode 100644 index 00000000000..31b7a2e5a2b --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.rst @@ -0,0 +1,11 @@ +.. + This file was automatically generated. Do not edit. + +cp.async.mbarrier.arrive.noinc.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.mbarrier.arrive.noinc.b64 [addr]; // PTX ISA 70, SM_80 + template + __device__ static inline void cp_async_mbarrier_arrive_noinc( + uint64_t* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst index b043eb9f456..8228b69ed41 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst @@ -10,7 +10,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.an // .src = { .shared::cta } // .type = { .b32 } // .op = { .and } - template + template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -29,7 +29,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or // .src = { .shared::cta } // .type = { .b32 } // .op = { .or } - template + template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -48,7 +48,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xo // .src = { .shared::cta } // .type = { .b32 } // .op = { .xor } - template + template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst b/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst new file mode 100644 index 00000000000..bc909c54319 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst @@ -0,0 +1,11 @@ +.. + This file was automatically generated. Do not edit. + +elect.sync +^^^^^^^^^^ +.. 
code:: cuda + + // elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90 + template + __device__ static inline bool elect_sync( + const uint32_t& membermask); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence.rst b/docs/libcudacxx/ptx/instructions/generated/fence.rst index ed21fa80b6e..50137394587 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence.rst @@ -5,94 +5,190 @@ fence.sc.cta ^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_t scope); fence.sc.gpu ^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_t scope); fence.sc.sys ^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_t scope); +fence.sc.cluster +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // 2. PTX ISA 78, SM_90 + // .sem = { .sc } + // .scope = { .cluster } + template + __device__ static inline void fence( + cuda::ptx::sem_sc_t, + cuda::ptx::scope_cluster_t); + fence.acq_rel.cta ^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .acq_rel } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_acq_rel_t, cuda::ptx::scope_t scope); fence.acq_rel.gpu ^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .acq_rel } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_acq_rel_t, cuda::ptx::scope_t scope); fence.acq_rel.sys ^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .acq_rel } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_acq_rel_t, cuda::ptx::scope_t scope); -fence.sc.cluster -^^^^^^^^^^^^^^^^ +fence.acq_rel.cluster +^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 2. PTX ISA 78, SM_90 + // .sem = { .acq_rel } // .scope = { .cluster } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_acq_rel_t, cuda::ptx::scope_cluster_t); -fence.acq_rel.cluster +fence.acquire.cta +^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); + +fence.acquire.cluster ^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 - // .sem = { .sc, .acq_rel } - // .scope = { .cluster } - template + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); + +fence.acquire.gpu +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); + +fence.acquire.sys +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); + +fence.release.cta +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.release.cluster +^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.release.gpu +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.release.sys +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst index 8376e96ce6b..9f4000b675e 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst @@ -13,7 +13,7 @@ fence.proxy.async.global ^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // fence.proxy.async.space; // 6. PTX ISA 80, SM_90 // .space = { .global, .shared::cluster, .shared::cta } template __device__ static inline void fence_proxy_async( @@ -23,7 +23,7 @@ fence.proxy.async.shared::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // fence.proxy.async.space; // 6. PTX ISA 80, SM_90 // .space = { .global, .shared::cluster, .shared::cta } template __device__ static inline void fence_proxy_async( @@ -33,7 +33,7 @@ fence.proxy.async.shared::cta ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // fence.proxy.async.space; // 6. 
PTX ISA 80, SM_90 // .space = { .global, .shared::cluster, .shared::cta } template __device__ static inline void fence_proxy_async( diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.rst new file mode 100644 index 00000000000..e67c4852355 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.rst @@ -0,0 +1,30 @@ +.. + This file was automatically generated. Do not edit. + +fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async::generic.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .space = { .shared::cluster } + // .scope = { .cluster } + template + __device__ static inline void fence_proxy_async_generic_sync_restrict( + cuda::ptx::sem_acquire_t, + cuda::ptx::space_cluster_t, + cuda::ptx::scope_cluster_t); + +fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async::generic.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .space = { .shared::cta } + // .scope = { .cluster } + template + __device__ static inline void fence_proxy_async_generic_sync_restrict( + cuda::ptx::sem_release_t, + cuda::ptx::space_shared_t, + cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_sync_restrict.rst b/docs/libcudacxx/ptx/instructions/generated/fence_sync_restrict.rst new file mode 100644 index 00000000000..bae82190e25 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_sync_restrict.rst @@ -0,0 +1,30 @@ +.. + This file was automatically generated. Do not edit. + +fence.acquire.sync_restrict::shared::cluster.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .space = { .shared::cluster } + // .scope = { .cluster } + template + __device__ static inline void fence_sync_restrict( + cuda::ptx::sem_acquire_t, + cuda::ptx::space_cluster_t, + cuda::ptx::scope_cluster_t); + +fence.release.sync_restrict::shared::cta.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .space = { .shared::cta } + // .scope = { .cluster } + template + __device__ static inline void fence_sync_restrict( + cuda::ptx::sem_release_t, + cuda::ptx::space_shared_t, + cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/special_registers.rst b/docs/libcudacxx/ptx/instructions/generated/get_sreg.rst similarity index 83% rename from docs/libcudacxx/ptx/instructions/generated/special_registers.rst rename to docs/libcudacxx/ptx/instructions/generated/get_sreg.rst index aa1add84781..9582c4384ff 100644 --- a/docs/libcudacxx/ptx/instructions/generated/special_registers.rst +++ b/docs/libcudacxx/ptx/instructions/generated/get_sreg.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + tid.x ^^^^^ .. code:: cuda // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_tid_x(); tid.y @@ -11,7 +14,7 @@ tid.y .. 
code:: cuda // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_tid_y(); tid.z @@ -19,7 +22,7 @@ tid.z .. code:: cuda // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_tid_z(); ntid.x @@ -27,7 +30,7 @@ ntid.x .. code:: cuda // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ntid_x(); ntid.y @@ -35,7 +38,7 @@ ntid.y .. code:: cuda // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ntid_y(); ntid.z @@ -43,7 +46,7 @@ ntid.z .. code:: cuda // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ntid_z(); laneid @@ -51,7 +54,7 @@ laneid .. code:: cuda // mov.u32 sreg_value, %%laneid; // PTX ISA 13 - template + template __device__ static inline uint32_t get_sreg_laneid(); warpid @@ -59,7 +62,7 @@ warpid .. code:: cuda // mov.u32 sreg_value, %%warpid; // PTX ISA 13 - template + template __device__ static inline uint32_t get_sreg_warpid(); nwarpid @@ -67,7 +70,7 @@ nwarpid .. code:: cuda // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_nwarpid(); ctaid.x @@ -75,7 +78,7 @@ ctaid.x .. code:: cuda // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ctaid_x(); ctaid.y @@ -83,7 +86,7 @@ ctaid.y .. code:: cuda // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ctaid_y(); ctaid.z @@ -91,7 +94,7 @@ ctaid.z .. code:: cuda // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ctaid_z(); nctaid.x @@ -99,7 +102,7 @@ nctaid.x .. code:: cuda // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_nctaid_x(); nctaid.y @@ -107,7 +110,7 @@ nctaid.y .. code:: cuda // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_nctaid_y(); nctaid.z @@ -115,7 +118,7 @@ nctaid.z .. code:: cuda // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_nctaid_z(); smid @@ -123,7 +126,7 @@ smid .. code:: cuda // mov.u32 sreg_value, %%smid; // PTX ISA 13 - template + template __device__ static inline uint32_t get_sreg_smid(); nsmid @@ -131,7 +134,7 @@ nsmid .. code:: cuda // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_nsmid(); gridid @@ -139,7 +142,7 @@ gridid .. code:: cuda // mov.u64 sreg_value, %%gridid; // PTX ISA 30 - template + template __device__ static inline uint64_t get_sreg_gridid(); is_explicit_cluster @@ -147,7 +150,7 @@ is_explicit_cluster .. code:: cuda // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 - template + template __device__ static inline bool get_sreg_is_explicit_cluster(); clusterid.x @@ -155,7 +158,7 @@ clusterid.x .. code:: cuda // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_clusterid_x(); clusterid.y @@ -163,7 +166,7 @@ clusterid.y .. code:: cuda // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_clusterid_y(); clusterid.z @@ -171,7 +174,7 @@ clusterid.z .. 
code:: cuda // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_clusterid_z(); nclusterid.x @@ -179,7 +182,7 @@ nclusterid.x .. code:: cuda // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_nclusterid_x(); nclusterid.y @@ -187,7 +190,7 @@ nclusterid.y .. code:: cuda // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_nclusterid_y(); nclusterid.z @@ -195,7 +198,7 @@ nclusterid.z .. code:: cuda // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_nclusterid_z(); cluster_ctaid.x @@ -203,7 +206,7 @@ cluster_ctaid.x .. code:: cuda // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); cluster_ctaid.y @@ -211,7 +214,7 @@ cluster_ctaid.y .. code:: cuda // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); cluster_ctaid.z @@ -219,7 +222,7 @@ cluster_ctaid.z .. code:: cuda // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); cluster_nctaid.x @@ -227,7 +230,7 @@ cluster_nctaid.x .. code:: cuda // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); cluster_nctaid.y @@ -235,7 +238,7 @@ cluster_nctaid.y .. code:: cuda // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); cluster_nctaid.z @@ -243,7 +246,7 @@ cluster_nctaid.z .. code:: cuda // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); cluster_ctarank @@ -251,7 +254,7 @@ cluster_ctarank .. code:: cuda // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_ctarank(); cluster_nctarank @@ -259,7 +262,7 @@ cluster_nctarank .. code:: cuda // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_nctarank(); lanemask_eq @@ -267,7 +270,7 @@ lanemask_eq .. code:: cuda // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_eq(); lanemask_le @@ -275,7 +278,7 @@ lanemask_le .. code:: cuda // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_le(); lanemask_lt @@ -283,7 +286,7 @@ lanemask_lt .. code:: cuda // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_lt(); lanemask_ge @@ -291,7 +294,7 @@ lanemask_ge .. code:: cuda // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_ge(); lanemask_gt @@ -299,7 +302,7 @@ lanemask_gt .. code:: cuda // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_gt(); clock @@ -307,7 +310,7 @@ clock .. 
code:: cuda // mov.u32 sreg_value, %%clock; // PTX ISA 10 - template + template __device__ static inline uint32_t get_sreg_clock(); clock_hi @@ -315,7 +318,7 @@ clock_hi .. code:: cuda // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 - template + template __device__ static inline uint32_t get_sreg_clock_hi(); clock64 @@ -323,7 +326,7 @@ clock64 .. code:: cuda // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 - template + template __device__ static inline uint64_t get_sreg_clock64(); globaltimer @@ -331,7 +334,7 @@ globaltimer .. code:: cuda // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 - template + template __device__ static inline uint64_t get_sreg_globaltimer(); globaltimer_lo @@ -339,7 +342,7 @@ globaltimer_lo .. code:: cuda // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 - template + template __device__ static inline uint32_t get_sreg_globaltimer_lo(); globaltimer_hi @@ -347,7 +350,7 @@ globaltimer_hi .. code:: cuda // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 - template + template __device__ static inline uint32_t get_sreg_globaltimer_hi(); total_smem_size @@ -355,7 +358,7 @@ total_smem_size .. code:: cuda // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 - template + template __device__ static inline uint32_t get_sreg_total_smem_size(); aggr_smem_size @@ -363,7 +366,7 @@ aggr_smem_size .. code:: cuda // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 - template + template __device__ static inline uint32_t get_sreg_aggr_smem_size(); dynamic_smem_size @@ -371,7 +374,7 @@ dynamic_smem_size .. code:: cuda // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 - template + template __device__ static inline uint32_t get_sreg_dynamic_smem_size(); current_graph_exec @@ -379,5 +382,5 @@ current_graph_exec .. code:: cuda // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 - template + template __device__ static inline uint64_t get_sreg_current_graph_exec(); diff --git a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst index 374c182576f..19b3783086c 100644 --- a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst +++ b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst @@ -5,7 +5,7 @@ getctarank.shared::cluster.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 + // getctarank.space.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } template __device__ static inline uint32_t getctarank( diff --git a/docs/libcudacxx/ptx/instructions/generated/mapa.rst b/docs/libcudacxx/ptx/instructions/generated/mapa.rst new file mode 100644 index 00000000000..4ffc70d85d9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mapa.rst @@ -0,0 +1,14 @@ +.. + This file was automatically generated. Do not edit. + +mapa.shared::cluster.u32 +^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mapa.space.u32 dest, addr, target_cta; // PTX ISA 78, SM_90 + // .space = { .shared::cluster } + template + __device__ static inline Tp* mapa( + cuda::ptx::space_cluster_t, + const Tp* addr, + uint32_t target_cta); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst index 21436e2b3ca..fea199e4747 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst @@ -24,7 +24,7 @@ mbarrier.arrive.release.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -39,7 +39,7 @@ mbarrier.arrive.release.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -54,7 +54,7 @@ mbarrier.arrive.release.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -70,7 +70,7 @@ mbarrier.arrive.release.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -86,7 +86,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -101,7 +101,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -112,3 +112,96 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& count); + +mbarrier.arrive.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr); + +mbarrier.arrive.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr); + +mbarrier.arrive.relaxed.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // PTX ISA 86, SM_90 + // .space = { .shared::cluster } + // .sem = { .relaxed } + // .scope = { .cluster } + template + __device__ static inline void mbarrier_arrive( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.relaxed.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 _, [addr]; // PTX ISA 86, SM_90 + // .space = { .shared::cluster } + // .sem = { .relaxed } + // .scope = { .cluster } + template + __device__ static inline void mbarrier_arrive( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst index 47c56eca31a..318a7eb5b98 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst @@ -5,7 +5,7 @@ mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 + // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -21,7 +21,7 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 + // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -37,7 +37,7 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 + // mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -48,3 +48,51 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& tx_count); + +mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& txCount); + +mbarrier.arrive.expect_tx.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& txCount); + +mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], txCount; // PTX ISA 86, SM_90 + // .space = { .shared::cluster } + // .sem = { .relaxed } + // .scope = { .cluster } + template + __device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr, + const uint32_t& txCount); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst index d16b2ac07ac..88ec36b43ac 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst @@ -15,7 +15,7 @@ mbarrier.test_wait.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 + // mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 2. 
PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -29,7 +29,7 @@ mbarrier.test_wait.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 + // mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -38,3 +38,33 @@ mbarrier.test_wait.acquire.cluster.shared::cta.b64 cuda::ptx::scope_t scope, uint64_t* addr, const uint64_t& state); + +mbarrier.test_wait.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_test_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); + +mbarrier.test_wait.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_test_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst index ec464b3398b..1496d6cbccb 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst @@ -15,7 +15,7 @@ mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 + // mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -29,7 +29,7 @@ mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 + // mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -38,3 +38,33 @@ mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 cuda::ptx::scope_t scope, uint64_t* addr, const uint32_t& phaseParity); + +mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_test_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_test_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst index 3dfdba46861..4d319a5b1e3 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst @@ -26,7 +26,7 @@ mbarrier.try_wait.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -40,7 +40,7 @@ mbarrier.try_wait.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -54,7 +54,7 @@ mbarrier.try_wait.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -69,7 +69,7 @@ mbarrier.try_wait.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -79,3 +79,65 @@ mbarrier.try_wait.acquire.cluster.shared::cta.b64 uint64_t* addr, const uint64_t& state, const uint32_t& suspendTimeHint); + +mbarrier.try_wait.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst index 4e7af4bace5..6a51704cab4 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst @@ -26,7 +26,7 @@ mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -40,7 +40,7 @@ mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -54,7 +54,7 @@ mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -69,7 +69,7 @@ mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -79,3 +79,65 @@ mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 uint64_t* addr, const uint32_t& phaseParity, const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/multimem_ld_reduce.rst b/docs/libcudacxx/ptx/instructions/generated/multimem_ld_reduce.rst new file mode 100644 index 00000000000..cd9f32bf5f0 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/multimem_ld_reduce.rst @@ -0,0 +1,2396 @@ +.. + This file was automatically generated. Do not edit. + +multimem.ld_reduce.weak.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cta.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.sys.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.weak.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cta.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.sys.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.weak.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
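code:: cuda
+
+   // Hand-written usage sketch (not part of the generated listing): calling the
+   // scoped relaxed/min/s32 overload documented in this entry. `mc_addr` is assumed
+   // to be a valid multimem (multicast) address; requires SM_90 or newer.
+   #include <cuda/ptx>
+
+   __device__ int32_t min_s32_gpu_scope(const int32_t* mc_addr)
+   {
+     return cuda::ptx::multimem_ld_reduce(
+       cuda::ptx::sem_relaxed, cuda::ptx::scope_gpu, cuda::ptx::op_min, mc_addr);
+   }
+
+..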
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cta.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.sys.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.weak.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cta.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.sys.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.weak.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cta.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.sys.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.weak.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cta.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.sys.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.weak.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cta.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.sys.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.weak.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cta.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.sys.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.weak.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cta.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.sys.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.weak.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
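code:: cuda
+
+   // Hand-written usage sketch (not part of the generated listing): the scoped
+   // relaxed/add/u64 overload documented in this entry, e.g. to sum a value held
+   // in each multicast replica. `mc_addr` is assumed to be a valid multimem
+   // (multicast) address; requires SM_90 or newer.
+   #include <cuda/ptx>
+
+   __device__ uint64_t add_u64_gpu_scope(const uint64_t* mc_addr)
+   {
+     return cuda::ptx::multimem_ld_reduce(
+       cuda::ptx::sem_relaxed, cuda::ptx::scope_gpu, cuda::ptx::op_add, mc_addr);
+   }
+
+..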
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.weak.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cta.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.sys.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.weak.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.weak.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .and } + template = true> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cta.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cluster.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.gpu.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.sys.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
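code:: cuda
+
+   // Hand-written usage sketch (not part of the generated listing): the bitwise
+   // overloads deduce a 32-bit type B32; here uint32_t is used with the
+   // relaxed/sys/and variant documented in this entry. `mc_addr` is assumed to be
+   // a valid multimem (multicast) address; requires SM_90 or newer.
+   #include <cuda/ptx>
+
+   __device__ uint32_t and_b32_sys_scope(const uint32_t* mc_addr)
+   {
+     return cuda::ptx::multimem_ld_reduce(
+       cuda::ptx::sem_relaxed, cuda::ptx::scope_sys, cuda::ptx::op_and_op, mc_addr);
+   }
+
+..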
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cta.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cluster.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.gpu.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.sys.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.weak.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .or } + template = true> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cta.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cluster.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.gpu.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.sys.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cta.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cluster.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.gpu.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.sys.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.weak.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .xor } + template = true> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cta.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cluster.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.gpu.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.sys.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cta.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cluster.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.gpu.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.sys.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.weak.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .and } + template = true> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cta.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cluster.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.gpu.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.sys.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cta.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cluster.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.gpu.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.sys.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.weak.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .or } + template = true> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cta.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cluster.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.gpu.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.sys.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cta.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cluster.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.gpu.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.sys.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.weak.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .xor } + template = true> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cta.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cluster.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.gpu.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.sys.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cta.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cluster.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda
+
+   // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90
+   // .sem = { .relaxed, .acquire }
+   // .scope = { .cta, .cluster, .gpu, .sys }
+   // .op = { .xor }
+   template <typename B64, _CUDA_VSTD::enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
+   __device__ static inline B64 multimem_ld_reduce(
+     cuda::ptx::sem_t<Sem> sem,
+     cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::op_xor_op_t,
+     const B64* addr);
+
+multimem.ld_reduce.acquire.gpu.global.xor.b64
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90
+   // .sem = { .relaxed, .acquire }
+   // .scope = { .cta, .cluster, .gpu, .sys }
+   // .op = { .xor }
+   template <typename B64, _CUDA_VSTD::enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
+   __device__ static inline B64 multimem_ld_reduce(
+     cuda::ptx::sem_t<Sem> sem,
+     cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::op_xor_op_t,
+     const B64* addr);
+
+multimem.ld_reduce.acquire.sys.global.xor.b64
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90
+   // .sem = { .relaxed, .acquire }
+   // .scope = { .cta, .cluster, .gpu, .sys }
+   // .op = { .xor }
+   template <typename B64, _CUDA_VSTD::enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
+   __device__ static inline B64 multimem_ld_reduce(
+     cuda::ptx::sem_t<Sem> sem,
+     cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::op_xor_op_t,
+     const B64* addr);
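
The generated listings above and below document the overload sets of ``cuda::ptx::multimem_ld_reduce`` and ``cuda::ptx::multimem_red``. The sketch below is a hypothetical usage example, not part of the generated files: it assumes ``mc_counter`` and ``mc_mask`` point into a multimem (multicast) mapping that the host has already set up (not shown), and that the kernel is built for SM_90+ with a CUDA 12 toolkit recent enough to provide PTX ISA 8.1.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Hypothetical kernel: mc_counter and mc_mask are assumed to be multimem
   // (multicast) addresses that the host mapped beforehand; that setup is not shown.
   __global__ void multimem_example(std::uint32_t* mc_counter,
                                    std::uint32_t* mc_mask,
                                    std::uint32_t* out)
   {
   #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
     namespace ptx = cuda::ptx;
     if (threadIdx.x == 0 && blockIdx.x == 0)
     {
       // multimem.red.relaxed.gpu.global.add.u32: add 1 to every replica behind mc_counter.
       ptx::multimem_red(ptx::sem_relaxed, ptx::scope_gpu, ptx::op_add, mc_counter, 1u);
       // multimem.ld_reduce.relaxed.gpu.global.or.b32: load the bitwise OR over all replicas of mc_mask.
       *out = ptx::multimem_ld_reduce(ptx::sem_relaxed, ptx::scope_gpu, ptx::op_or_op, mc_mask);
     }
   #endif
   }

The ``sem_relaxed``/``scope_gpu`` pair used here is just one of the combinations listed in these files; any documented combination can be selected by passing the corresponding ``cuda::ptx::sem_*`` and ``cuda::ptx::scope_*`` tag values instead.
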
diff --git a/docs/libcudacxx/ptx/instructions/generated/multimem_red.rst b/docs/libcudacxx/ptx/instructions/generated/multimem_red.rst
new file mode 100644
index 00000000000..095efaef45c
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/generated/multimem_red.rst
@@ -0,0 +1,2306 @@
+..
+   This file was automatically generated. Do not edit.
+
+multimem.red.relaxed.cta.global.min.u32
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
+   // .sem = { .relaxed, .release }
+   // .scope = { .cta, .cluster, .gpu, .sys }
+   // .op = { .min }
+   template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
+   __device__ static inline void multimem_red(
+     cuda::ptx::sem_t<Sem> sem,
+     cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::op_min_t,
+     uint32_t* addr,
+     uint32_t val);
+
+multimem.red.relaxed.cluster.global.min.u32
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
+   // .sem = { .relaxed, .release }
+   // .scope = { .cta, .cluster, .gpu, .sys }
+   // .op = { .min }
+   template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
+   __device__ static inline void multimem_red(
+     cuda::ptx::sem_t<Sem> sem,
+     cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::op_min_t,
+     uint32_t* addr,
+     uint32_t val);
+
+multimem.red.relaxed.gpu.global.min.u32
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
+   // .sem = { .relaxed, .release }
+   // .scope = { .cta, .cluster, .gpu, .sys }
+   // .op = { .min }
+   template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
+   __device__ static inline void multimem_red(
+     cuda::ptx::sem_t<Sem> sem,
+     cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::op_min_t,
+     uint32_t* addr,
+     uint32_t val);
+
+multimem.red.relaxed.sys.global.min.u32
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
+   // .sem = { .relaxed, .release }
+   // .scope = { .cta, .cluster, .gpu, .sys }
+   // .op = { .min }
+   template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
+   __device__ static inline void multimem_red(
+     cuda::ptx::sem_t<Sem> sem,
+     cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::op_min_t,
+     uint32_t* addr,
+     uint32_t val);
+
+multimem.red.release.cta.global.min.u32
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
+   // .sem = { .relaxed, .release }
+   // .scope = { .cta, .cluster, .gpu, .sys }
+   // .op = { .min }
+   template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
+   __device__ static inline void multimem_red(
+     cuda::ptx::sem_t<Sem> sem,
+     cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::op_min_t,
+     uint32_t* addr,
+     uint32_t val);
+
+multimem.red.release.cluster.global.min.u32
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
+   // .sem = { .relaxed, .release }
+   // .scope = { .cta, .cluster, .gpu, .sys }
+   // .op = { .min }
+   template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
+   __device__ static inline void multimem_red(
+     cuda::ptx::sem_t<Sem> sem,
+     cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::op_min_t,
+     uint32_t* addr,
+     uint32_t val);
+
+multimem.red.release.gpu.global.min.u32
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
+   // .sem = { .relaxed, .release }
+   // .scope = { .cta, .cluster, .gpu, .sys }
+   // .op = { .min }
+   template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
+   __device__ static inline void multimem_red(
+     cuda::ptx::sem_t<Sem> sem,
+     cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::op_min_t,
+     uint32_t* addr,
+     uint32_t val);
+
+multimem.red.release.sys.global.min.u32
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90
+   // .sem = { .relaxed, .release }
+   // .scope = { .cta, .cluster, .gpu, .sys }
+   // .op = { .min }
+   template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
+   __device__ static inline void multimem_red(
+     cuda::ptx::sem_t<Sem> sem,
+     cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::op_min_t,
+     uint32_t* addr,
+     uint32_t val);
+
+multimem.red.relaxed.cta.global.min.u64
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
+   // .sem = { .relaxed, .release }
+   // .scope = { .cta, .cluster, .gpu, .sys }
+   // .op = { .min }
+   template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
+   __device__ static inline void multimem_red(
+     cuda::ptx::sem_t<Sem> sem,
+     cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::op_min_t,
+     uint64_t* addr,
+     uint64_t val);
+
+multimem.red.relaxed.cluster.global.min.u64
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90
+   // .sem = { .relaxed, .release }
+   // .scope = { .cta, .cluster, .gpu, .sys }
+   // .op = { .min }
+   template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
+   __device__ static inline void multimem_red(
+     cuda::ptx::sem_t<Sem> sem,
+     cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::op_min_t,
+     uint64_t* addr,
+     uint64_t val);
+
+multimem.red.relaxed.gpu.global.min.u64
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+..
code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.sys.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cta.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cluster.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.gpu.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.sys.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cta.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cluster.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.gpu.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.sys.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cta.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cluster.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.release.gpu.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.release.sys.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cta.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cluster.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.gpu.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.sys.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cta.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cluster.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.release.gpu.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.release.sys.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cta.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cluster.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.gpu.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.sys.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cta.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cluster.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.gpu.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.sys.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cta.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cluster.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.gpu.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.sys.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cta.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cluster.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.gpu.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.sys.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cta.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cluster.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.gpu.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.sys.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cta.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cluster.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.release.gpu.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.release.sys.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cta.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cluster.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.gpu.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.sys.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cta.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cluster.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.release.gpu.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.release.sys.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cta.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cluster.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.gpu.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.sys.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cta.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cluster.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.gpu.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.sys.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cta.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cluster.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.gpu.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.sys.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cta.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cluster.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.release.gpu.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.release.sys.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.release.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.release.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cta.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cluster.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.gpu.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.sys.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.release.cta.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.release.cluster.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.release.gpu.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.release.sys.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cta.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cluster.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.gpu.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.sys.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.release.cta.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.release.cluster.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.release.gpu.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.release.sys.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cta.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cluster.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.gpu.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.sys.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.release.cta.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.release.cluster.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.release.gpu.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.release.sys.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cta.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cluster.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.gpu.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.sys.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.release.cta.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.release.cluster.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.release.gpu.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.release.sys.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cta.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cluster.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.gpu.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.sys.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.release.cta.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.release.cluster.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.release.gpu.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.release.sys.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cta.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cluster.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.gpu.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.sys.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.release.cta.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.release.cluster.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.release.gpu.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.release.sys.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); diff --git a/docs/libcudacxx/ptx/instructions/generated/multimem_st.rst b/docs/libcudacxx/ptx/instructions/generated/multimem_st.rst new file mode 100644 index 00000000000..00695328b76 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/multimem_st.rst @@ -0,0 +1,250 @@ +.. + This file was automatically generated. Do not edit. + +multimem.st.weak.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .weak } + template = true> + __device__ static inline void multimem_st( + cuda::ptx::sem_weak_t, + B32* addr, + B32 val); + +multimem.st.relaxed.cta.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.relaxed.cluster.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.relaxed.gpu.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.relaxed.sys.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.release.cta.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.release.cluster.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
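The listings above give only the generated signatures for ``cuda::ptx::multimem_red``. A minimal usage sketch, assuming ``mc_addr`` already refers to a multimem (multicast) address set up on the host and that the CCCL build in this patch series is available (SM_90 or later):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Minimal sketch: `mc_addr` is assumed to be a multimem (multicast) address
   // created on the host beforehand; requires SM_90 or later.
   __global__ void multicast_accumulate(int32_t* mc_addr, int32_t my_val)
   {
     // Emits: multimem.red.relaxed.gpu.global.add.s32 [mc_addr], my_val;
     cuda::ptx::multimem_red(
       cuda::ptx::sem_relaxed, cuda::ptx::scope_gpu, cuda::ptx::op_add, mc_addr, my_val);
   }

Substituting the other sem/scope tag objects from the listing (for example ``cuda::ptx::sem_release`` and ``cuda::ptx::scope_cluster``) selects the corresponding instruction variants.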
code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.release.gpu.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.release.sys.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.weak.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .weak } + template = true> + __device__ static inline void multimem_st( + cuda::ptx::sem_weak_t, + B64* addr, + B64 val); + +multimem.st.relaxed.cta.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.relaxed.cluster.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.relaxed.gpu.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.relaxed.sys.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.release.cta.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.release.cluster.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.release.gpu.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.release.sys.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); diff --git a/docs/libcudacxx/ptx/instructions/generated/red_async.rst b/docs/libcudacxx/ptx/instructions/generated/red_async.rst index 658fe0a8f44..c575b808401 100644 --- a/docs/libcudacxx/ptx/instructions/generated/red_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/red_async.rst @@ -5,7 +5,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } template @@ -19,7 +19,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } template @@ -33,7 +33,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
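For ``cuda::ptx::multimem_st`` the pattern is the same; a minimal sketch, again assuming ``mc_addr`` is a multimem (multicast) address and an SM_90+ target:

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Minimal sketch: `mc_addr` is assumed to be a multimem (multicast) address;
   // requires SM_90 or later.
   __global__ void multicast_store(uint32_t* mc_addr, uint32_t v)
   {
     // Emits: multimem.st.release.sys.global.b32 [mc_addr], v;
     cuda::ptx::multimem_st(cuda::ptx::sem_release, cuda::ptx::scope_sys, mc_addr, v);
   }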
code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } template @@ -47,7 +47,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } template @@ -61,7 +61,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } template @@ -75,7 +75,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } template @@ -89,7 +89,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } template @@ -103,7 +103,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } template @@ -117,10 +117,10 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .b32 } // .op = { .and } - template + template = true> __device__ static inline void red_async( cuda::ptx::op_and_op_t, B32* dest, @@ -131,10 +131,10 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .b32 } // .op = { .or } - template + template = true> __device__ static inline void red_async( cuda::ptx::op_or_op_t, B32* dest, @@ -145,10 +145,10 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .b32 } // .op = { .xor } - template + template = true> __device__ static inline void red_async( cuda::ptx::op_xor_op_t, B32* dest, @@ -159,7 +159,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } template @@ -173,7 +173,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } template __device__ static inline void red_async( diff --git a/docs/libcudacxx/ptx/instructions/generated/st_async.rst b/docs/libcudacxx/ptx/instructions/generated/st_async.rst index d00a152cf29..8cfc21ba0b5 100644 --- a/docs/libcudacxx/ptx/instructions/generated/st_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/st_async.rst @@ -5,7 +5,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.type [addr], value, [remote_bar]; // 1. 
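A minimal sketch of the ``red_async`` wrapper documented above, assuming ``remote_counter`` and ``remote_bar`` are shared::cluster addresses of a counter and an initialized mbarrier in the destination block of the same cluster (for example obtained via cooperative groups' ``map_shared_rank``):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Minimal sketch: `remote_counter` and `remote_bar` are assumed to be
   // shared::cluster addresses of a counter and an initialized mbarrier in the
   // destination block of the same cluster.
   __device__ void add_to_remote(uint32_t* remote_counter, uint32_t value, uint64_t* remote_bar)
   {
     // Emits: red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32
     cuda::ptx::red_async(cuda::ptx::op_add, remote_counter, value, remote_bar);
   }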
PTX ISA 81, SM_90 // .type = { .b32, .b64 } template __device__ static inline void st_async( @@ -17,7 +17,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.type [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 // .type = { .b32, .b64 } template __device__ static inline void st_async( @@ -29,7 +29,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.type [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 // .type = { .b32, .b64 } template __device__ static inline void st_async( @@ -41,7 +41,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.type [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 // .type = { .b32, .b64 } template __device__ static inline void st_async( @@ -54,7 +54,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 .. code:: cuda // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90 - template + template = true> __device__ static inline void st_async( B32* addr, const B32 (&value)[4], diff --git a/docs/libcudacxx/ptx/instructions/generated/st_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/st_bulk.rst new file mode 100644 index 00000000000..817d3875fdc --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/st_bulk.rst @@ -0,0 +1,13 @@ +.. + This file was automatically generated. Do not edit. + +st.bulk.weak.shared::cta +^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // st.bulk.weak.shared::cta [addr], size, initval; // PTX ISA 86, SM_100 + template + __device__ static inline void st_bulk( + void* addr, + uint64_t size, + cuda::ptx::n32_t initval); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_alloc.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_alloc.rst new file mode 100644 index 00000000000..3bfb60fca71 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_alloc.rst @@ -0,0 +1,70 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.alloc.cta_group.sync.aligned.shared::cta.b32 [dst], nCols; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_alloc( + cuda::ptx::cta_group_t cta_group, + uint32_t* dst, + const uint32_t& nCols); + +tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
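A minimal sketch for the new ``st_bulk`` wrapper shown above, assuming an SM_100 target and that the zero ``initval`` immediate and any size/alignment rules of the PTX ISA are respected:

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Minimal sketch: zero-fills a shared-memory buffer. Requires an SM_100
   // target; the immediate `initval` and any size/alignment rules come from
   // the PTX ISA.
   __global__ void bulk_init_shared()
   {
     __shared__ alignas(16) unsigned char buf[1024];
     // Emits: st.bulk.weak.shared::cta [buf], 1024, 0;
     cuda::ptx::st_bulk(buf, sizeof(buf), cuda::ptx::n32_t<0>{});
   }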
code:: cuda + + // tcgen05.alloc.cta_group.sync.aligned.shared::cta.b32 [dst], nCols; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_alloc( + cuda::ptx::cta_group_t cta_group, + uint32_t* dst, + const uint32_t& nCols); + +tcgen05.dealloc.cta_group::1.sync.aligned.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.dealloc.cta_group.sync.aligned.b32 taddr, nCols; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_dealloc( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + const uint32_t& nCols); + +tcgen05.dealloc.cta_group::2.sync.aligned.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.dealloc.cta_group.sync.aligned.b32 taddr, nCols; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_dealloc( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + const uint32_t& nCols); + +tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.relinquish_alloc_permit.cta_group.sync.aligned; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_relinquish_alloc_permit( + cuda::ptx::cta_group_t cta_group); + +tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.relinquish_alloc_permit.cta_group.sync.aligned; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_relinquish_alloc_permit( + cuda::ptx::cta_group_t cta_group); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_commit.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_commit.rst new file mode 100644 index 00000000000..d5546fed3e5 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_commit.rst @@ -0,0 +1,48 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_commit( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar); + +tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_commit( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar); + +tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
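A minimal allocation/deallocation sketch for the ``tcgen05_alloc`` family above, assuming an sm_100a/sm_101a target; the tag constant ``cuda::ptx::cta_group_1`` is assumed here from the library's usual ``sem_release``/``scope_cta`` naming scheme:

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Minimal sketch; requires compiling for sm_100a/sm_101a. The tag constant
   // cuda::ptx::cta_group_1 is assumed from the library's usual naming scheme.
   __global__ void tensor_memory_roundtrip()
   {
     __shared__ uint32_t taddr_slot; // receives the tensor-memory address
     const uint32_t ncols = 32;

     cuda::ptx::tcgen05_alloc(cuda::ptx::cta_group_1, &taddr_slot, ncols);
     __syncthreads();

     // ... use the allocation (e.g. via tcgen05.cp) ...

     cuda::ptx::tcgen05_dealloc(cuda::ptx::cta_group_1, taddr_slot, ncols);
     cuda::ptx::tcgen05_relinquish_alloc_permit(cuda::ptx::cta_group_1);
   }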
code:: cuda + + // tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_commit_multicast( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar, + uint16_t ctaMask); + +tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_commit_multicast( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar, + uint16_t ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_cp.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_cp.rst new file mode 100644 index 00000000000..b0195c5b28e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_cp.rst @@ -0,0 +1,434 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.cp.cta_group::1.128x256b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x256b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.4x256b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.4x256b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x128b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x128b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
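A minimal sketch for ``tcgen05_commit``, with the same sm_100a/sm_101a and ``cta_group_1`` assumptions as above; ``bar`` must be an initialized shared-memory mbarrier that the caller later waits on:

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Minimal sketch; same sm_100a/sm_101a and cta_group_1 assumptions as above.
   // `bar` must be an initialized shared-memory mbarrier that is waited on later.
   __device__ void commit_pending_tcgen05_ops(uint64_t* bar, uint16_t cta_mask)
   {
     // Track completion of previously issued tcgen05 operations via `bar`:
     cuda::ptx::tcgen05_commit(cuda::ptx::cta_group_1, bar);

     // Or signal the corresponding mbarrier in several CTAs of the cluster:
     cuda::ptx::tcgen05_commit_multicast(cuda::ptx::cta_group_1, bar, cta_mask);
   }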
code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.32x128b.warpx4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.32x128b.warpx4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.cp.cta_group.4x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.cp.cta_group.128x128b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_fence.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_fence.rst new file mode 100644 index 00000000000..ee287ea8860 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_fence.rst @@ -0,0 +1,18 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.fence::before_thread_sync +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
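code:: cuda
+
+   // --- Illustrative usage sketch (editor's addition, not generated content) ---
+   // Sketch of pairing the two tcgen05 fences (both documented below) around an
+   // execution barrier; `fence_example` is a hypothetical helper and the exact
+   // ordering guarantees are those of the underlying PTX instructions.
+   #include <cuda/ptx>
+
+   __device__ void fence_example()
+   {
+     cuda::ptx::tcgen05_fence_before_thread_sync(); // order prior tcgen05 ops before the barrier
+     __syncthreads();
+     cuda::ptx::tcgen05_fence_after_thread_sync();  // order later tcgen05 ops after the barrier
+   }
+
+..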
code:: cuda + + // tcgen05.fence::before_thread_sync; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void tcgen05_fence_before_thread_sync(); + +tcgen05.fence::after_thread_sync +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.fence::after_thread_sync; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void tcgen05_fence_after_thread_sync(); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_ld.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_ld.rst new file mode 100644 index 00000000000..0bb6bdbb5f5 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_ld.rst @@ -0,0 +1,758 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.ld.sync.aligned.16x64b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[1], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[1], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
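code:: cuda
+
+   // --- Illustrative usage sketch (editor's addition, not generated content) ---
+   // Shows a call of the .16x256b.x1 load documented below: per its signature it
+   // fills four 32-bit registers per thread from Tensor Memory address `taddr`.
+   // `ld_16x256b_example` is a hypothetical helper name.
+   #include <cuda/ptx>
+
+   __device__ void ld_16x256b_example(uint32_t taddr)
+   {
+     uint32_t out[4]; // B32 deduces to uint32_t (any 4-byte type is accepted)
+     cuda::ptx::tcgen05_ld_16x256b(out, taddr);
+     // out[0..3] now hold the values loaded from Tensor Memory.
+   }
+
+..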
code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[1], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[1], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x32bx2.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[1], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[1], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[2], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[2], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[4], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[4], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[8], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[8], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[16], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[16], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[32], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[32], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[64], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
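code:: cuda
+
+   // --- Illustrative usage sketch (editor's addition, not generated content) ---
+   // Sketch of the .16x32bx2 packed load documented below. The split offset is a
+   // compile-time immediate; passing it as `cuda::ptx::n32_t<16>{}` follows the
+   // usual cuda::ptx immediate-argument convention and the value 16 is purely
+   // illustrative. `ld_16x32bx2_pack_example` is a hypothetical helper name.
+   #include <cuda/ptx>
+
+   __device__ void ld_16x32bx2_pack_example(uint32_t taddr)
+   {
+     uint32_t out[64];
+     cuda::ptx::tcgen05_ld_16x32bx2_pack_16b(out, taddr, cuda::ptx::n32_t<16>{});
+   }
+
+..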
code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[64], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[128], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[128], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma.rst new file mode 100644 index 00000000000..aa5a1675193 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma.rst @@ -0,0 +1,2378 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
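code:: cuda
+
+   // --- Illustrative usage sketch (editor's addition, not generated content) ---
+   // Minimal call of the overload documented below (shared-memory descriptors for
+   // A and B, no disable_output_lane, no scale_input_d). The tag values
+   // `cuda::ptx::kind_f8f6f4` and `cuda::ptx::cta_group_1` are assumed names for
+   // the .kind::f8f6f4 and .cta_group::1 qualifiers; `mma_example` is hypothetical.
+   #include <cuda/ptx>
+
+   __device__ void mma_example(uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc, uint32_t idesc)
+   {
+     // enable_input_d = false: the existing contents of D are not used as input.
+     cuda::ptx::tcgen05_mma(cuda::ptx::kind_f8f6f4, cuda::ptx::cta_group_1,
+                            d_tmem, a_desc, b_desc, idesc, false);
+   }
+
+..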
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
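code:: cuda
+
+   // --- Illustrative usage sketch (editor's addition, not generated content) ---
+   // Like the plain tcgen05_mma call, but with the A operand read from Tensor
+   // Memory (`a_tmem`) instead of a shared-memory descriptor, matching the
+   // signature below. `cuda::ptx::kind_tf32` / `cuda::ptx::cta_group_1` are
+   // assumed tag names; `mma_tmem_a_example` is a hypothetical helper.
+   #include <cuda/ptx>
+
+   __device__ void mma_tmem_a_example(uint32_t d_tmem, uint32_t a_tmem, uint64_t b_desc, uint32_t idesc)
+   {
+     cuda::ptx::tcgen05_mma_tmem_a(cuda::ptx::kind_tf32, cuda::ptx::cta_group_1,
+                                   d_tmem, a_tmem, b_desc, idesc, true);
+   }
+
+..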
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
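code:: cuda
+
+   // Editorial usage sketch (not part of the generated reference): a single
+   // cta_group::1 block-scaled MMA with kind::mxf8f6f4 and scale_vec::1X.
+   // The descriptors (a_desc, b_desc, idesc) and tensor-memory addresses
+   // (d_tmem, scale_A_tmem, scale_B_tmem) are assumed to be prepared elsewhere;
+   // the kind_mxf8f6f4 / cta_group_1 tag constants are assumed names that follow
+   // the library's usual convention.
+   #include <cuda/ptx>
+   #include <cstdint>
+
+   __global__ void mma_mxf8f6f4_1x(uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc,
+                                   uint32_t idesc, uint32_t scale_A_tmem, uint32_t scale_B_tmem)
+   {
+   #if __cccl_ptx_isa >= 860 // PTX ISA 8.6, sm_100a / sm_101a
+     cuda::ptx::tcgen05_mma_block_scale_vec_1x(
+       cuda::ptx::kind_mxf8f6f4, cuda::ptx::cta_group_1,
+       d_tmem, a_desc, b_desc, idesc,
+       scale_A_tmem, scale_B_tmem,
+       /* enable_input_d */ false); // D is overwritten rather than accumulated
+   #endif
+   }
+
+.. 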
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
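code:: cuda
+
+   // Editorial usage sketch: the scale_vec::2X form with kind::mxf4nvf4 issued for
+   // a CTA pair (cta_group::2). Operand preparation (descriptors, tensor-memory
+   // allocation, the scale-factor layout implied by scale_vec::2X) is out of scope
+   // here; kind_mxf4nvf4 / cta_group_2 are assumed tag-constant names.
+   #include <cuda/ptx>
+   #include <cstdint>
+
+   __global__ void mma_mxf4nvf4_2x(uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc,
+                                   uint32_t idesc, uint32_t scale_A_tmem, uint32_t scale_B_tmem)
+   {
+   #if __cccl_ptx_isa >= 860 // PTX ISA 8.6, sm_100a / sm_101a
+     cuda::ptx::tcgen05_mma_block_scale_vec_2x(
+       cuda::ptx::kind_mxf4nvf4, cuda::ptx::cta_group_2,
+       d_tmem, a_desc, b_desc, idesc,
+       scale_A_tmem, scale_B_tmem,
+       /* enable_input_d */ true); // accumulate into the existing D tile
+   #endif
+   }
+
+.. 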
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
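code:: cuda
+
+   // Editorial usage sketch: the collector::a::fill variant takes the same
+   // operands as the plain block-scale form; the qualifier only asks the tensor
+   // core to fill the A-operand collector buffer so that later collector::a::use /
+   // ::lastuse calls can reuse A without re-reading it (see the PTX ISA for the
+   // exact buffer contract). Tag-constant names are assumed as elsewhere.
+   #include <cuda/ptx>
+   #include <cstdint>
+
+   __global__ void mma_fill_a(uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc,
+                              uint32_t idesc, uint32_t scale_A_tmem, uint32_t scale_B_tmem)
+   {
+   #if __cccl_ptx_isa >= 860 // PTX ISA 8.6, sm_100a / sm_101a
+     // First MMA of a sequence that will reuse A: fill the collector buffer.
+     cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill(
+       cuda::ptx::kind_mxf4, cuda::ptx::cta_group_2,
+       d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem, true);
+   #endif
+   }
+
+.. 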
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
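code:: cuda
+
+   // Editorial usage sketch: collector::a::use issues the same block-scaled MMA
+   // but reuses the A operand that an earlier collector::a::fill call placed in
+   // the collector buffer, so typically only b_desc changes between calls. The
+   // fill/use pairing is an illustration of the intended pattern, not a
+   // requirement enforced by this wrapper.
+   #include <cuda/ptx>
+   #include <cstdint>
+
+   __global__ void mma_reuse_a(uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc,
+                               uint32_t idesc, uint32_t scale_A_tmem, uint32_t scale_B_tmem)
+   {
+   #if __cccl_ptx_isa >= 860 // PTX ISA 8.6, sm_100a / sm_101a
+     cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use(
+       cuda::ptx::kind_mxf4, cuda::ptx::cta_group_2,
+       d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem, true);
+   #endif
+   }
+
+.. 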
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
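code:: cuda
+
+   // Editorial usage sketch: collector::a::lastuse marks the final reuse of the
+   // A operand held in the collector buffer, after which the buffer contents need
+   // not be preserved. Operands match the other collector variants; tag-constant
+   // names are assumed as elsewhere.
+   #include <cuda/ptx>
+   #include <cstdint>
+
+   __global__ void mma_last_reuse_a(uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc,
+                                    uint32_t idesc, uint32_t scale_A_tmem, uint32_t scale_B_tmem)
+   {
+   #if __cccl_ptx_isa >= 860 // PTX ISA 8.6, sm_100a / sm_101a
+     cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse(
+       cuda::ptx::kind_mxf4, cuda::ptx::cta_group_2,
+       d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem, true);
+   #endif
+   }
+
+.. 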
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
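code:: cuda
+
+   // Editorial usage sketch: collector::a::discard tells the tensor core not to
+   // retain A in the collector buffer after this MMA (the natural choice for a
+   // one-off call); operands are otherwise identical to the other collector
+   // variants, and tag-constant names are assumed as elsewhere.
+   #include <cuda/ptx>
+   #include <cstdint>
+
+   __global__ void mma_discard_a(uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc,
+                                 uint32_t idesc, uint32_t scale_A_tmem, uint32_t scale_B_tmem)
+   {
+   #if __cccl_ptx_isa >= 860 // PTX ISA 8.6, sm_100a / sm_101a
+     cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard(
+       cuda::ptx::kind_mxf4, cuda::ptx::cta_group_2,
+       d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem, true);
+   #endif
+   }
+
+.. 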
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
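All of the block-scale wrappers above share one calling pattern: the ``.kind`` and ``.cta_group`` dot-variants come first as tag arguments, followed by the tensor-memory accumulator address ``d_tmem``, the A/B operand descriptors, the instruction descriptor ``idesc``, the tensor-memory addresses of the A/B scale factors, and the ``enable_input_d`` predicate. The sketch below shows how such a call might look; it is illustrative only. The tag constant names ``cuda::ptx::kind_mxf4`` and ``cuda::ptx::cta_group_1``, the ``<cuda/ptx>`` include, the ``__cccl_ptx_isa`` guard, and the helper name ``mma_block_scale_sketch`` are assumptions following the usual ``cuda::ptx`` conventions, and every operand value is a placeholder the caller would have to construct per the tcgen05 rules. As the comments above note, the instruction needs PTX ISA 8.6 and an sm_100a or sm_101a target.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Hedged sketch, not working code: operands are placeholders supplied by the
   // caller, and the tag constant names are assumed from cuda::ptx conventions.
   __device__ void mma_block_scale_sketch(
     uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc, uint32_t idesc,
     uint32_t scale_A_tmem, uint32_t scale_B_tmem)
   {
   #if __cccl_ptx_isa >= 860 // only compiled when the toolchain speaks PTX ISA 8.6
     cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard(
       cuda::ptx::kind_mxf4,    // assumed value name for the .kind::mxf4 tag
       cuda::ptx::cta_group_1,  // assumed value name for the .cta_group::1 tag
       d_tmem, a_desc, b_desc, idesc,
       scale_A_tmem, scale_B_tmem,
       /* enable_input_d = */ true);
   #endif
   }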
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma_ws.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma_ws.rst new file mode 100644 index 00000000000..cb900a0ec40 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma_ws.rst @@ -0,0 +1,4482 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
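Every tcgen05.mma.ws entry in this file comes as a pair of overloads, one taking the trailing optional ``zero_column_mask_desc`` and one without it, and each exists both in a descriptor-A form and a ``_tmem_a_`` form. A minimal sketch of the descriptor-A ``collector::b0::use`` call with the mask follows; as before, ``cuda::ptx::cta_group_1`` and ``cuda::ptx::kind_tf32`` are assumed tag names, the operands are placeholders, and the call is only meaningful on an sm_100a/sm_101a target with PTX ISA 8.6.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Hedged sketch of the weight-stationary MMA that reuses collector buffer B0
   // (collector::b0::use); all operand values are caller-provided placeholders.
   __device__ void mma_ws_b0_use_sketch(
     uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc, uint32_t idesc,
     uint64_t zero_column_mask_desc)
   {
   #if __cccl_ptx_isa >= 860
     cuda::ptx::tcgen05_mma_ws_collector_b0_use(
       cuda::ptx::cta_group_1, // only .cta_group::1 is listed for the .ws form
       cuda::ptx::kind_tf32,   // assumed value name for the .kind::tf32 tag
       d_tmem, a_desc, b_desc, idesc,
       /* enable_input_d = */ true,
       zero_column_mask_desc); // drop this argument to get the mask-less overload
   #endif
   }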
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
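The ``_tmem_a_`` wrappers above differ from the descriptor forms only in the A operand: the 64-bit ``a_desc`` shared-memory descriptor is replaced by the 32-bit tensor-memory address ``a_tmem`` (the ``[a_tmem]`` operand in the instruction comments). A sketch of the ``collector::b0::lastuse`` tensor-memory form, under the same assumptions as the earlier sketches:

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Hedged sketch: A is read from tensor memory instead of being described by a
   // shared-memory descriptor; operand values are caller-provided placeholders.
   __device__ void mma_ws_tmem_a_sketch(
     uint32_t d_tmem, uint32_t a_tmem, uint64_t b_desc, uint32_t idesc)
   {
   #if __cccl_ptx_isa >= 860
     cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse(
       cuda::ptx::cta_group_1,
       cuda::ptx::kind_f16,           // assumed value name for the .kind::f16 tag
       d_tmem, a_tmem, b_desc, idesc,
       /* enable_input_d = */ false); // overload without zero_column_mask_desc
   #endif
   }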
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda
+
+   // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
+   // .cta_group = { .cta_group::1 }
+   // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
+   template
+   __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
+     cuda::ptx::cta_group_1_t,
+     cuda::ptx::kind_t kind,
+     uint32_t d_tmem,
+     uint32_t a_tmem,
+     uint64_t b_desc,
+     uint32_t idesc,
+     bool enable_input_d);
+
+tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
+   // .cta_group = { .cta_group::1 }
+   // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
+   template
+   __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
+     cuda::ptx::cta_group_1_t,
+     cuda::ptx::kind_t kind,
+     uint32_t d_tmem,
+     uint32_t a_tmem,
+     uint64_t b_desc,
+     uint32_t idesc,
+     bool enable_input_d);
+
+tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
+   // .cta_group = { .cta_group::1 }
+   // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
+   template
+   __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard(
+     cuda::ptx::cta_group_1_t,
+     cuda::ptx::kind_t kind,
+     uint32_t d_tmem,
+     uint32_t a_tmem,
+     uint64_t b_desc,
+     uint32_t idesc,
+     bool enable_input_d);
diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_shift.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_shift.rst
new file mode 100644
index 00000000000..54e665ed3cc
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_shift.rst
@@ -0,0 +1,24 @@
+..
+   This file was automatically generated. Do not edit.
+
+tcgen05.shift.cta_group::1.down
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // tcgen05.shift.cta_group.down [taddr]; // PTX ISA 86, SM_100a, SM_101a
+   // .cta_group = { .cta_group::1, .cta_group::2 }
+   template
+   __device__ static inline void tcgen05_shift_down(
+     cuda::ptx::cta_group_t cta_group,
+     uint32_t taddr);
+
+tcgen05.shift.cta_group::2.down
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // tcgen05.shift.cta_group.down [taddr]; // PTX ISA 86, SM_100a, SM_101a
+   // .cta_group = { .cta_group::1, .cta_group::2 }
+   template
+   __device__ static inline void tcgen05_shift_down(
+     cuda::ptx::cta_group_t cta_group,
+     uint32_t taddr);
diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_st.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_st.rst
new file mode 100644
index 00000000000..3147a1757d8
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_st.rst
@@ -0,0 +1,758 @@
+..
+   This file was automatically generated. Do not edit.
+
+tcgen05.st.sync.aligned.16x64b.x1.b32
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // tcgen05.st.sync.aligned.16x64b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
+   template = true>
+   __device__ static inline void tcgen05_st_16x64b(
+     uint32_t taddr,
+     const B32 (&values)[1]);
+
+tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. 
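The following is an illustrative usage sketch for the ``tcgen05_shift_down`` overloads documented in the ``tcgen05_shift.rst`` hunk above; it is not part of the generated file. It assumes ``taddr`` already holds a valid tensor-memory address (for example, one obtained through a prior tcgen05 allocation), that the kernel targets ``sm_100a``/``sm_101a`` with PTX ISA 8.6 available, and that default-constructing ``cuda::ptx::cta_group_1_t`` selects the ``.cta_group::1`` form.

.. code:: cuda

   #include <cuda/ptx>

   // Hedged sketch: shifts the tensor-memory rows addressed by taddr down,
   // using the .cta_group::1 overload listed above. taddr is assumed to come
   // from an earlier tcgen05 allocation performed by this kernel.
   __global__ void shift_example(uint32_t taddr)
   {
     cuda::ptx::tcgen05_shift_down(cuda::ptx::cta_group_1_t{}, taddr);
   }

The ``.cta_group::2`` entry above has the same shape and differs only in the group tag passed as the first argument.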
code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.16x64b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x64b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x64b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x64b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x64b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x64b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x64b.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x128.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x128b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x128b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x128b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x128b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x128b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x128b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x128b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x256b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x256b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x256b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x256b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x256b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x256b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.32x32b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.32x32b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.32x32b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.32x32b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.32x32b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.32x32b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.32x32b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.32x32b.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x128.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x32bx2.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x1.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.16x32bx2.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x2.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x32bx2.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x4.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x32bx2.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x8.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x32bx2.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x16.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x32bx2.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x32.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x32bx2.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x64.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x32bx2.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x128.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[128]); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_wait.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_wait.rst new file mode 100644 index 00000000000..ec48818eecc --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_wait.rst @@ -0,0 +1,18 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.wait::ld.sync.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.wait::ld.sync.aligned; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void tcgen05_wait_ld(); + +tcgen05.wait::st.sync.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.wait::st.sync.aligned; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void tcgen05_wait_st(); diff --git a/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst index a8c4a260782..fbf010d6009 100644 --- a/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst @@ -5,9 +5,9 @@ tensormap.replace.tile.global_address.global.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_global_address( cuda::ptx::space_global_t, void* tm_addr, @@ -17,9 +17,9 @@ tensormap.replace.tile.global_address.shared::cta.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda - // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_global_address( cuda::ptx::space_shared_t, void* tm_addr, @@ -29,9 +29,9 @@ tensormap.replace.tile.rank.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_rank( cuda::ptx::space_global_t, void* tm_addr, @@ -41,9 +41,9 @@ tensormap.replace.tile.rank.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_rank( cuda::ptx::space_shared_t, void* tm_addr, @@ -53,9 +53,9 @@ tensormap.replace.tile.box_dim.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_box_dim( cuda::ptx::space_global_t, void* tm_addr, @@ -66,9 +66,9 @@ tensormap.replace.tile.box_dim.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_box_dim( cuda::ptx::space_shared_t, void* tm_addr, @@ -79,9 +79,9 @@ tensormap.replace.tile.global_dim.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_global_dim( cuda::ptx::space_global_t, void* tm_addr, @@ -92,9 +92,9 @@ tensormap.replace.tile.global_dim.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_global_dim( cuda::ptx::space_shared_t, void* tm_addr, @@ -105,9 +105,9 @@ tensormap.replace.tile.global_stride.global.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda - // tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_global_stride( cuda::ptx::space_global_t, void* tm_addr, @@ -118,9 +118,9 @@ tensormap.replace.tile.global_stride.shared::cta.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_global_stride( cuda::ptx::space_shared_t, void* tm_addr, @@ -131,9 +131,35 @@ tensormap.replace.tile.element_stride.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> + __device__ static inline void tensormap_replace_element_stride( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +tensormap.replace.tile.element_stride.shared::cta.b1024.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a + // .space = { .shared::cta } + template = true> + __device__ static inline void tensormap_replace_element_stride( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +tensormap.replace.tile.element_stride.global.b1024.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a + // .space = { .global } + template = true> __device__ static inline void tensormap_replace_element_size( cuda::ptx::space_global_t, void* tm_addr, @@ -144,9 +170,9 @@ tensormap.replace.tile.element_stride.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_element_size( cuda::ptx::space_shared_t, void* tm_addr, @@ -157,7 +183,7 @@ tensormap.replace.tile.elemtype.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_elemtype( @@ -169,7 +195,7 @@ tensormap.replace.tile.elemtype.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda - // tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_elemtype( @@ -181,7 +207,7 @@ tensormap.replace.tile.interleave_layout.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_interleave_layout( @@ -193,7 +219,7 @@ tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_interleave_layout( @@ -205,7 +231,7 @@ tensormap.replace.tile.swizzle_mode.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_swizzle_mode( @@ -217,7 +243,7 @@ tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_swizzle_mode( @@ -229,7 +255,7 @@ tensormap.replace.tile.fill_mode.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_fill_mode( @@ -241,10 +267,34 @@ tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_fill_mode( cuda::ptx::space_shared_t, void* tm_addr, cuda::ptx::n32_t new_val); + +tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tensormap.replace.tile.swizzle_atomicity.space.b1024.b32 [tm_addr], new_val; // PTX ISA 86, SM_100a, SM_101a + // .space = { .global } + template + __device__ static inline void tensormap_replace_swizzle_atomicity( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tensormap.replace.tile.swizzle_atomicity.space.b1024.b32 [tm_addr], new_val; // PTX ISA 86, SM_100a, SM_101a + // .space = { .shared::cta } + template + __device__ static inline void tensormap_replace_swizzle_atomicity( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); diff --git a/docs/libcudacxx/ptx/instructions/special_registers.rst b/docs/libcudacxx/ptx/instructions/special_registers.rst index 1e9597fa726..1981f7fb908 100644 --- a/docs/libcudacxx/ptx/instructions/special_registers.rst +++ b/docs/libcudacxx/ptx/instructions/special_registers.rst @@ -6,4 +6,4 @@ Special registers - PTX ISA: `Special Register `__ -.. include:: generated/special_registers.rst +.. include:: generated/get_sreg.rst diff --git a/docs/repo.toml b/docs/repo.toml index 7ff29fd6eba..08ce4e58775 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -54,7 +54,7 @@ api_output_directory = "api" use_fast_doxygen_conversion = true sphinx_generate_doxygen_groups = true sphinx_generate_doxygen_pages = true -sphinx_exclude_patterns = [] +sphinx_exclude_patterns = ['ptx/instructions/generated'] [repo_docs.projects.cub] name = "CUB" diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h index c8ce41c0a20..75a72db7024 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h @@ -14,12 +14,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_ template _CCCL_DEVICE static inline void barrier_cluster_arrive() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive;" : : : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 780 @@ -34,12 +34,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_be template _CCCL_DEVICE static inline void barrier_cluster_wait() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.wait;" : : : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.wait;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 780 @@ -56,13 +56,13 @@ extern "C" _CCCL_DEVICE void 
__cuda_ptx_barrier_cluster_arrive_is_not_supported_ template _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) { - // __sem == sem_release (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive.release;" : : : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.release;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -79,13 +79,13 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_ template _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) { - // __sem == sem_relaxed (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive.relaxed;" : : :);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +// __sem == sem_relaxed (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.relaxed;" : : :); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -102,13 +102,13 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_be template _CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) { - // __sem == sem_acquire (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.wait.acquire;" : : : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); +// __sem == sem_acquire (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.wait.acquire;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h new file mode 100644 index 00000000000..80fe3796e69 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h @@ -0,0 +1,130 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ +#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ + +/* +// barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90 +// .aligned = { .aligned } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(dot_aligned_t) +{ +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 780 + +/* +// barrier.cluster.wait.aligned; // PTX ISA 78, SM_90 +// .aligned = { .aligned } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_wait( + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_wait(dot_aligned_t) +{ +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.wait.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 780 + +/* +// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 +// .sem = { .release } +// .aligned = { .aligned } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t, dot_aligned_t) +{ +// __sem == sem_release (due to parameter type constraint) +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.release.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 +// .sem = { .relaxed } +// .aligned = { .aligned } +// Marked volatile +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_relaxed_t, + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t, dot_aligned_t) +{ +// __sem == sem_relaxed (due to parameter type constraint) +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.relaxed.aligned;" : : :); +# else + // Unsupported architectures will have a linker 
error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90 +// .sem = { .acquire } +// .aligned = { .aligned } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t, dot_aligned_t) +{ +// __sem == sem_acquire (due to parameter type constraint) +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.wait.acquire.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h b/libcudacxx/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h new file mode 100644 index 00000000000..19e3f92bd13 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h @@ -0,0 +1,240 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CLUSTERLAUNCHCONTROL_H_ +#define _CUDA_PTX_GENERATED_CLUSTERLAUNCHCONTROL_H_ + +/* +// clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [addr], [smem_bar]; // PTX ISA +86, SM_100 template +__device__ static inline void clusterlaunchcontrol_try_cancel( + void* addr, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_clusterlaunchcontrol_try_cancel_is_not_supported_before_SM_100__(); +template +_CCCL_DEVICE static inline void clusterlaunchcontrol_try_cancel(void* __addr, _CUDA_VSTD::uint64_t* __smem_bar) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + asm("clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [%0], [%1];" + : + : "r"(__as_ptr_smem(__addr)), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_try_cancel_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 [addr], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a template +__device__ static inline void clusterlaunchcontrol_try_cancel_multicast( + void* addr, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_clusterlaunchcontrol_try_cancel_multicast_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void clusterlaunchcontrol_try_cancel_multicast(void* __addr, _CUDA_VSTD::uint64_t* __smem_bar) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 " + "[%0], [%1];" + : + : "r"(__as_ptr_smem(__addr)), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); 
+# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_try_cancel_multicast_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 pred_is_canceled, try_cancel_response; // PTX ISA 86, SM_100 +template = true> +__device__ static inline bool clusterlaunchcontrol_query_cancel_is_canceled( + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_clusterlaunchcontrol_query_cancel_is_canceled_is_not_supported_before_SM_100__(); +template = true> +_CCCL_DEVICE static inline bool clusterlaunchcontrol_query_cancel_is_canceled(_B128 __try_cancel_response) +{ + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + _CUDA_VSTD::uint32_t __pred_is_canceled; + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%1, %2}; \n" + "{\n\t .reg .pred P_OUT; \n\t" + "clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 P_OUT, B128_try_cancel_response;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}\n\t" + "}" + : "=r"(__pred_is_canceled) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); + return static_cast(__pred_is_canceled); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_is_canceled_is_not_supported_before_SM_100__(); + return false; +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 +template = true, typename B128, enable_if_t += true> +__device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_x( + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_x_is_not_supported_before_SM_100__(); +template = true, + typename _B128, + _CUDA_VSTD::enable_if_t = true> +_CCCL_DEVICE static inline _B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_x(_B128 __try_cancel_response) +{ + static_assert(sizeof(_B32) == 4, ""); + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + _CUDA_VSTD::uint32_t __ret_dim; + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%1, %2}; \n" + "clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 %0, B128_try_cancel_response;\n\t" + "}" + : "=r"(__ret_dim) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); + return *reinterpret_cast<_B32*>(&__ret_dim); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_x_is_not_supported_before_SM_100__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 +template = true, typename B128, enable_if_t += true> +__device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_y( + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
+__cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_y_is_not_supported_before_SM_100__(); +template = true, + typename _B128, + _CUDA_VSTD::enable_if_t = true> +_CCCL_DEVICE static inline _B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_y(_B128 __try_cancel_response) +{ + static_assert(sizeof(_B32) == 4, ""); + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + _CUDA_VSTD::uint32_t __ret_dim; + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%1, %2}; \n" + "clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 %0, B128_try_cancel_response;\n\t" + "}" + : "=r"(__ret_dim) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); + return *reinterpret_cast<_B32*>(&__ret_dim); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_y_is_not_supported_before_SM_100__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 +template = true, typename B128, enable_if_t += true> +__device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_z( + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_z_is_not_supported_before_SM_100__(); +template = true, + typename _B128, + _CUDA_VSTD::enable_if_t = true> +_CCCL_DEVICE static inline _B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_z(_B128 __try_cancel_response) +{ + static_assert(sizeof(_B32) == 4, ""); + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + _CUDA_VSTD::uint32_t __ret_dim; + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%1, %2}; \n" + "clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 %0, B128_try_cancel_response;\n\t" + "}" + : "=r"(__ret_dim) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); + return *reinterpret_cast<_B32*>(&__ret_dim); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_z_is_not_supported_before_SM_100__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 block_dim, try_cancel_response; // PTX ISA 86, SM_100 +template = true, typename B128, enable_if_t += true> +__device__ static inline void clusterlaunchcontrol_query_cancel_get_first_ctaid( + B32 (&block_dim)[4], + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_is_not_supported_before_SM_100__(); +template = true, + typename _B128, + _CUDA_VSTD::enable_if_t = true> +_CCCL_DEVICE static inline void +clusterlaunchcontrol_query_cancel_get_first_ctaid(_B32 (&__block_dim)[4], _B128 __try_cancel_response) +{ + static_assert(sizeof(_B32) == 4, ""); + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 
1000 + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%4, %5}; \n" + "clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 {%0, %1, %2, %3}, B128_try_cancel_response;\n\t" + "}" + : "=r"(__block_dim[0]), "=r"(__block_dim[1]), "=r"(__block_dim[2]), "=r"(__block_dim[3]) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_CLUSTERLAUNCHCONTROL_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h index d2196402e7a..a9aa3534611 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h @@ -4,8 +4,7 @@ #define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ /* -// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, -SM_90 +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -28,23 +27,60 @@ _CCCL_DEVICE static inline void cp_async_bulk( const _CUDA_VSTD::uint32_t& __size, _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // " - "1a. unicast" : : "r"(__as_ptr_smem(__dstMem)), - "l"(__as_ptr_gmem(__srcMem)), - "r"(__size), - "r"(__as_ptr_smem(__smem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
PTX ISA 80, SM_90 +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } template @@ -67,23 +103,25 @@ _CCCL_DEVICE static inline void cp_async_bulk( const _CUDA_VSTD::uint32_t& __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. " : : "r"( - __as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -100,17 +138,56 @@ template _CCCL_DEVICE static inline void cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. 
" : : "l"(__as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// cp.async.bulk.dst.src.bulk_group.cp_mask [dstMem], [srcMem], size, byteMask; // PTX ISA 86, SM_100 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_cp_mask( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + const uint16_t& byteMask); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_cp_mask_is_not_supported_before_SM_100__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_cp_mask( + space_global_t, + space_shared_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + const _CUDA_VSTD::uint16_t& __byteMask) +{ +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + asm("cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%0], [%1], %2, %3;" + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size), "h"(__byteMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_cp_mask_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h index 3c32743e977..3b906fd6922 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h @@ -13,12 +13,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_suppor template _CCCL_DEVICE static inline void cp_async_bulk_commit_group() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.commit_group;" : : :);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("cp.async.bulk.commit_group;" : : :); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h index f54bf8bbdeb..7ac386343b9 100644 --- 
a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h @@ -4,8 +4,8 @@ #define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ /* -// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], -ctaMask; // 1. PTX ISA 80, SM_90a +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; +// PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -19,7 +19,7 @@ __device__ static inline void cp_async_bulk( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk( space_cluster_t, @@ -30,19 +30,22 @@ _CCCL_DEVICE static inline void cp_async_bulk( _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], " - "%4; // 1. " : : "r"(__as_ptr_smem(__dstMem)), - "l"(__as_ptr_gmem(__srcMem)), - "r"(__size), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], %4;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__as_ptr_gmem(__srcMem)), + "r"(__size), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h index f7c60bb72f6..2326346f547 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h @@ -4,8 +4,8 @@ #define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ /* -// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1a. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -28,23 +28,116 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[1], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];// " - "1a." : : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__as_ptr_smem(__smem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + 
void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2}], " + "[%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2}], " + "[%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -65,22 +158,23 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[1], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%0, {%1}], [%2]; // 3a." : : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__as_ptr_smem(__srcMem)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%0, {%1}], [%2];" + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1b. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -103,24 +197,132 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[2], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], " - "[%4];// 1b." : : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__smem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], [%4];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], [%4];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void 
cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, " + "%3}], [%4];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, " + "%3}], [%4];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -141,23 +343,23 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[2], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%1, %2}], [%3]; // 3b." : : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__srcMem)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%1, %2}], [%3];" + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1c. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -180,25 +382,136 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[3], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], " - "[%5];// 1c." : : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__smem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], [%5];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], [%5];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, " + "%4}], [%5];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, " + "%4}], [%5];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with 
a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -219,25 +532,27 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[3], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 3c." : : "l"( - __tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3}], [%4];" + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1d. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -260,26 +575,141 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[4], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, " - "%5}], [%6];// 1d." 
: : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__smem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5}], " + "[%6];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5}], [%6];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const 
void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, " + "%4, %5}], [%6];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, " + "%4, %5}], [%6];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -300,26 +730,28 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[4], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 3d." : : "l"( - __tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5];" + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1e. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -342,27 +774,146 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[5], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, " - "%6}], [%7];// 1e." : : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__smem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, " + "%6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, %6}], " + "[%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, " + "%4, %5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, " + "%4, %5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + 
"r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -383,21 +934,23 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[5], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 3e." : : "l"( - __tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6];" + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h new file mode 100644 index 00000000000..f376f1b48c3 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h @@ -0,0 +1,288 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_GATHER_SCATTER_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_GATHER_SCATTER_H_ + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, " + "%5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, " + "{%2, %3, %4, %5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + 
asm("cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, " + "{%2, %3, %4, %5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster " + "[%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], +[tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == 
space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster." + "cta_group::1 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster." + "cta_group::2 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, +SM_100a, SM_101a +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor_tile_scatter4( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_scatter4_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_scatter4( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + const void* __srcMem) +{ +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6];" + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_scatter4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_GATHER_SCATTER_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h index 56c199d39ff..b0d845b92a0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h @@ -5,7 +5,7 @@ /* // 
cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -19,7 +19,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -29,26 +29,95 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[1], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2}], [%3], %4;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2}], [%3], %4; // 2a." 
: : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2}], [%3], %4;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2}], [%3], %4;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -62,7 +131,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -72,27 +141,98 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[2], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2, %3}], [%4], %5;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + 
cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3}], [%4], %5; // 2b." : : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2, %3}], [%4], %5;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2, %3}], [%4], %5;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2c. 
PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -106,7 +246,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -116,28 +256,101 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[3], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2, %3, %4}], [%5], %6;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4}], [%5], %6; // 2c." 
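// A minimal usage sketch of the new cta_group overload shown above, using the 2-D
// tile form (illustrative only, not part of the generated header): the cluster
// launch, the tensor map, the mbarrier setup and every name below are assumptions
// of the example, and cta_group_1 / cta_group_2 are assumed to be the tag values
// exposed alongside cuda::ptx::cta_group_t.

#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void multicast_tile_2d(void* tile_smem,               // destination in shared::cluster
                                  const void* tensor_map,        // CUtensorMap* prepared on the host
                                  cuda::std::uint64_t* smem_bar, // mbarrier tracking the copy
                                  cuda::std::uint16_t cta_mask)  // CTAs receiving the multicast
{
  const cuda::std::int32_t coords[2] = {0, 0};
  // Selecting cta_group_1 emits the ".cta_group::1" qualified instruction added in
  // this change; the overload without the cta_group argument keeps the SM_90a form.
  cuda::ptx::cp_async_bulk_tensor(
    cuda::ptx::space_cluster, cuda::ptx::space_global, cuda::ptx::cta_group_1,
    tile_smem, tensor_map, coords, smem_bar, cta_mask);
}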
: : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2, %3, %4}], [%5], %6;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2, %3, %4}], [%5], %6;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -151,7 +364,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -161,29 +374,104 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[4], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2, %3, %4, %5}], [%6], %7;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], 
[smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4, %5}], [%6], %7; // 2d." : : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2, %3, %4, %5}], [%6], %7;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2, %3, %4, %5}], [%6], %7;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2e. 
PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -197,7 +485,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -207,24 +495,101 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[5], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4, %5, %6}], [%7], %8; // 2e." 
: : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h index 85b1507f721..b0373a3e6a7 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h @@ -14,12 +14,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supporte template _CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __N) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.wait_group %0;" : : "n"(__N.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("cp.async.bulk.wait_group %0;" : : "n"(__N.value) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -34,12 +34,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_sup template _CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __N) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.wait_group.read %0;" : : "n"(__N.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm 
volatile("cp.async.bulk.wait_group.read %0;" : : "n"(__N.value) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h new file mode 100644 index 00000000000..b2bf07247c1 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_H_ + +/* +// cp.async.mbarrier.arrive.b64 [addr]; // PTX ISA 70, SM_80 +template +__device__ static inline void cp_async_mbarrier_arrive( + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_mbarrier_arrive_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline void cp_async_mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + asm("cp.async.mbarrier.arrive.b64 [%0];" : : "r"(__as_ptr_smem(__addr)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_mbarrier_arrive_is_not_supported_before_SM_80__(); +# endif +} +#endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h new file mode 100644 index 00000000000..816a3fc63b9 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. 
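// A minimal usage sketch of the cp_async_mbarrier_arrive wrapper defined above
// (illustrative only, not part of the generated header): the barrier is assumed to
// live in shared memory and to have been initialized elsewhere, for example with
// cuda::ptx::mbarrier_init, and the function name is made up.

#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void track_cp_async_with_mbarrier(cuda::std::uint64_t* shared_mbarrier)
{
  // Ties the cp.async operations previously issued by this thread to the mbarrier:
  // once they complete, an arrive-on operation is performed on *shared_mbarrier.
  cuda::ptx::cp_async_mbarrier_arrive(shared_mbarrier);
}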
+ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_NOINC_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_NOINC_H_ + +/* +// cp.async.mbarrier.arrive.noinc.b64 [addr]; // PTX ISA 70, SM_80 +template +__device__ static inline void cp_async_mbarrier_arrive_noinc( + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_mbarrier_arrive_noinc_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline void cp_async_mbarrier_arrive_noinc(_CUDA_VSTD::uint64_t* __addr) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + asm("cp.async.mbarrier.arrive.noinc.b64 [%0];" : : "r"(__as_ptr_smem(__addr)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_mbarrier_arrive_noinc_is_not_supported_before_SM_80__(); +# endif +} +#endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_NOINC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h index 9b1bf35b290..499fda57c91 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h @@ -10,7 +10,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .b32 } // .op = { .and } -template +template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -22,7 +22,7 @@ __device__ static inline void cp_reduce_async_bulk( */ #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_cluster_t, space_shared_t, @@ -32,20 +32,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_and_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [%0], [%1], %2, [%3]; " - "// 1." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_b32 (due to parameter type constraint) +// __op == op_and_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [%0], [%1], %2, [%3]; // " + "1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -56,7 +59,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .b32 } // .op = { .or } -template +template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -68,7 +71,7 @@ __device__ static inline void cp_reduce_async_bulk( */ #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_cluster_t, space_shared_t, @@ -78,20 +81,22 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_or_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [%0], [%1], %2, [%3]; " - "// 1." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_b32 (due to parameter type constraint) +// __op == op_or_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [%0], [%1], %2, [%3]; // 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -102,7 +107,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .b32 } // .op = { .xor } -template +template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -114,7 +119,7 @@ __device__ static inline void cp_reduce_async_bulk( */ #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_cluster_t, space_shared_t, @@ -124,20 +129,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_xor_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [%0], [%1], %2, [%3]; " - "// 1." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_b32 (due to parameter type constraint) +// __op == op_xor_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -170,20 +178,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [%0], [%1], %2, [%3]; " - "// 1." 
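// A minimal usage sketch of the cluster-scope cp_reduce_async_bulk overloads being
// reformatted in these hunks, using the .and.b32 form (illustrative only, not part
// of the generated header): buffer and barrier setup and every name below are
// assumptions of the example, and the template heads of the b32 overloads appear to
// gain a SFINAE constraint restricting the element type to 4 bytes (the exact text
// is not fully legible in this diff).

#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void and_reduce_to_peer_cta(cuda::std::uint32_t* peer_dst,   // buffer mapped into shared::cluster
                                       const cuda::std::uint32_t* src,  // local shared::cta buffer
                                       cuda::std::uint32_t size_bytes,
                                       cuda::std::uint64_t* rdsmem_bar) // mbarrier in the destination CTA
{
  // uint32_t is a 4-byte type, matching the .b32 slot of the instruction above.
  cuda::ptx::cp_reduce_async_bulk(
    cuda::ptx::space_cluster, cuda::ptx::space_shared, cuda::ptx::op_and_op,
    peer_dst, src, size_bytes, rdsmem_bar);
}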
: : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -216,20 +227,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [%0], [%1], %2, [%3]; " - "// 1." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -262,20 +276,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [%0], [%1], %2, [%3]; " - "// 1." 
: : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -308,20 +325,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [%0], [%1], %2, [%3]; " - "// 1." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_inc (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -354,20 +374,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [%0], [%1], %2, [%3]; " - "// 1." 
: : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_dec (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -400,20 +423,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [%0], [%1], %2, [%3]; " - "// 1." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -446,20 +472,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [%0], [%1], %2, [%3]; " - "// 1." 
: : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -492,20 +521,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [%0], [%1], %2, [%3]; " - "// 1." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -538,20 +570,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " - "// 1." 
: : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -584,20 +619,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " - "// 2." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; // " + "2." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -625,24 +663,26 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // __space == space_global (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_and_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __op == op_and_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -670,24 +710,26 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // __space == space_global (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_or_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __op == op_or_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -715,24 +757,26 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // __space == space_global (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_xor_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3." 
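// A minimal usage sketch of the bulk_group (shared::cta -> global) .and form shown
// above, which picks .b32 or .b64 from sizeof(_Type) (illustrative only, not part
// of the generated header): the surrounding hunks do not show the full parameter
// list, so the call below assumes the (space_global, space_shared, op_and_op,
// dstMem, srcMem, size) shape implied by the asm operands, and every name is made up.

#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void and_reduce_to_global(cuda::std::uint64_t* gmem_dst,       // destination in global memory
                                     const cuda::std::uint64_t* smem_src, // source buffer in shared::cta
                                     cuda::std::uint32_t size_bytes)
{
  // sizeof(uint64_t) == 8, so the _CCCL_IF_CONSTEXPR chain emits the .b64 variant.
  cuda::ptx::cp_reduce_async_bulk(
    cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_and_op,
    gmem_dst, smem_src, size_bytes);
}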
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __op == op_xor_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -762,19 +806,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -804,19 +848,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -846,19 +890,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -888,19 +932,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_inc (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -930,19 +974,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_dec (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -972,19 +1016,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1014,19 +1058,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1056,19 +1100,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1098,19 +1142,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u64 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1140,19 +1184,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u64 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1182,19 +1226,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1224,19 +1268,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s64 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1266,19 +1310,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s64 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1303,19 +1347,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_add_t, float* __dstMem, const float* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1340,19 +1384,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_add_t, double* __dstMem, const double* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1382,19 +1426,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 6." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 6." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h index da5cdb6bc9b..5c177976468 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h @@ -29,19 +29,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const __nv_bfloat16* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_bf16 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -71,19 +71,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const __nv_bfloat16* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_bf16 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -113,19 +113,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const __nv_bfloat16* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [%0], [%1], %2; // 5." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_bf16 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [%0], [%1], %2; // 5." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h index 3d9d4520dcb..95d775d09e2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h @@ -24,19 +24,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_min_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f16 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -61,19 +61,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_max_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f16 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -98,19 +98,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_add_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [%0], [%1], %2; // 5." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f16 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [%0], [%1], %2; // 5." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h index 9ec5b2443d8..540b0e95ed5 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h @@ -34,53 +34,67 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." 
- : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -115,53 +129,67 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
- : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
+ : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -196,85 +224,99 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -309,93 +351,107 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -430,109 +486,115 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " - "1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h b/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h new file mode 100644 index 00000000000..e8691178f14 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h @@ -0,0 +1,36 @@ +// This file was automatically generated. Do not edit. 
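// The hunks above (and the rest of this patch) apply the same mechanical
// conversion: the NV_IF_ELSE_TARGET macro dispatch is replaced by a plain
// preprocessor guard. A minimal, self-contained sketch of that pattern follows;
// the wrapper name and its "not supported" stub are hypothetical, and the sketch
// checks only __CUDA_ARCH__ where the generated headers additionally accept the
// NVHPC compiler via _CCCL_CUDA_COMPILER(NVHPC).

// Declared but never defined: calling it on an unsupported architecture produces
// a link error whose symbol name spells out the requirement.
extern "C" __device__ void __cuda_ptx_example_is_not_supported_before_SM_90__();

__device__ static inline void example_sm90_wrapper()
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
  // Supported path: emit the real PTX (here, an SM_90 instruction from this patch).
  asm volatile("fence.sc.cluster;" : : : "memory");
#else
  // Unsupported architectures will have a linker error with a semi-decent error message.
  __cuda_ptx_example_is_not_supported_before_SM_90__();
#endif
}

// Instantiating the wrapper from a kernel links cleanly only when compiled for
// sm_90 or newer, which is exactly the behavior the guard is meant to preserve.
__global__ void example_kernel()
{
  example_sm90_wrapper();
}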
+ +#ifndef _CUDA_PTX_GENERATED_ELECT_SYNC_H_ +#define _CUDA_PTX_GENERATED_ELECT_SYNC_H_ + +/* +// elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90 +template +__device__ static inline bool elect_sync( + const uint32_t& membermask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_elect_sync_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool elect_sync(const _CUDA_VSTD::uint32_t& __membermask) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __is_elected; + asm volatile( + "{\n\t .reg .pred P_OUT; \n\t" + "elect.sync _|P_OUT, %1;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__is_elected) + : "r"(__membermask) + :); + return static_cast(__is_elected); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_elect_sync_is_not_supported_before_SM_90__(); + return false; +# endif +} +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_ELECT_SYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h index db00c4d4cba..c0bd9e9a3d2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h @@ -4,71 +4,205 @@ #define _CUDA_PTX_GENERATED_FENCE_H_ /* -// fence{.sem}.scope; // 1. PTX ISA 60, SM_70 -// .sem = { .sc, .acq_rel } +// fence.sem.scope; // 1. PTX ISA 60, SM_70 +// .sem = { .sc } // .scope = { .cta, .gpu, .sys } -template +template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_t scope); */ #if __cccl_ptx_isa >= 600 extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_70__(); -template -_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope) +template +_CCCL_DEVICE static inline void fence(sem_sc_t, scope_t<_Scope> __scope) { - static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); + // __sem == sem_sc (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_70, - ( - _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { - asm volatile("fence.sc.cta; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { - asm volatile("fence.sc.gpu; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { - asm volatile("fence.sc.sys; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { - asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { - asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { - asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_is_not_supported_before_SM_70__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.sc.cta; // 1." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.sc.gpu; // 1." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.sc.sys; // 1." 
: : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_70__(); +# endif } #endif // __cccl_ptx_isa >= 600 /* -// fence{.sem}.scope; // 2. PTX ISA 78, SM_90 -// .sem = { .sc, .acq_rel } +// fence.sem.scope; // 2. PTX ISA 78, SM_90 +// .sem = { .sc } // .scope = { .cluster } -template +template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_cluster_t); */ #if __cccl_ptx_isa >= 780 extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) +template +_CCCL_DEVICE static inline void fence(sem_sc_t, scope_cluster_t) { - static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); - // __scope == scope_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__sem == sem_sc) { - asm volatile("fence.sc.cluster; // 2." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel) { - asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_is_not_supported_before_SM_90__();)); +// __sem == sem_sc (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.sc.cluster; // 2." : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 780 +/* +// fence.sem.scope; // 1. PTX ISA 60, SM_70 +// .sem = { .acq_rel } +// .scope = { .cta, .gpu, .sys } +template +__device__ static inline void fence( + cuda::ptx::sem_acq_rel_t, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 600 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_70__(); +template +_CCCL_DEVICE static inline void fence(sem_acq_rel_t, scope_t<_Scope> __scope) +{ + // __sem == sem_acq_rel (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_gpu || __scope == scope_sys, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_70__(); +# endif +} +#endif // __cccl_ptx_isa >= 600 + +/* +// fence.sem.scope; // 2. 
PTX ISA 78, SM_90 +// .sem = { .acq_rel } +// .scope = { .cluster } +template +__device__ static inline void fence( + cuda::ptx::sem_acq_rel_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence(sem_acq_rel_t, scope_cluster_t) +{ +// __sem == sem_acq_rel (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 780 + +/* +// fence.sem.scope; // PTX ISA 86, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void fence( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence(sem_acquire_t, scope_t<_Scope> __scope) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.acquire.cta;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile("fence.acquire.cluster;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.acquire.gpu;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.acquire.sys;" : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// fence.sem.scope; // PTX ISA 86, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence(sem_release_t, scope_t<_Scope> __scope) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.release.cta;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile("fence.release.cluster;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.release.gpu;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.release.sys;" : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_FENCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h 
b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h index f8c4e6cf476..6b0c8ec161d 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h @@ -17,14 +17,14 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_fence_mbarrier_init_is_not_supported_bef template _CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster_t) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("fence.mbarrier_init.release.cluster; // 3." : : : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.mbarrier_init.release.cluster; // 3." : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h index cc413a0f511..e520d99bfaa 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h @@ -13,12 +13,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_alias_is_not_supported_befor template _CCCL_DEVICE static inline void fence_proxy_alias() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("fence.proxy.alias; // 4." : : : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700 + asm volatile("fence.proxy.alias; // 4." : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__(); +# endif } #endif // __cccl_ptx_isa >= 750 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h index 176d24ff73f..f8ee49909db 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h @@ -13,17 +13,17 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_befor template _CCCL_DEVICE static inline void fence_proxy_async() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("fence.proxy.async; // 5." : : : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.proxy.async; // 5." 
: : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 +// fence.proxy.async.space; // 6. PTX ISA 80, SM_90 // .space = { .global, .shared::cluster, .shared::cta } template __device__ static inline void fence_proxy_async( @@ -35,19 +35,23 @@ template _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) { static_assert(__space == space_global || __space == space_cluster || __space == space_shared, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__space == space_global) { - asm volatile("fence.proxy.async.global; // 6." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__space == space_cluster) { - asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__space == space_shared) { - asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__space == space_global) + { + asm volatile("fence.proxy.async.global; // 6." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__space == space_cluster) + { + asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__space == space_shared) + { + asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h new file mode 100644 index 00000000000..93c66063ea3 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h @@ -0,0 +1,62 @@ +// This file was automatically generated. Do not edit. 
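// A hedged usage sketch for the fence_proxy_async wrappers converted above
// (kernel and buffer names are illustrative, not part of the library): after
// ordinary generic-proxy stores into shared memory, the space-qualified fence
// orders them before a later async-proxy consumer such as a bulk copy reads the
// same bytes. Requires SM_90 and PTX ISA 8.0; on older targets the call resolves
// to the link-error stub shown in the diff.

#include <cuda/ptx>

__global__ void stage_for_async_proxy(int* __restrict__ out) // launch with 32 threads per block
{
  __shared__ int staging[32];
  staging[threadIdx.x] = static_cast<int>(threadIdx.x); // generic-proxy write
  __syncthreads();

  // Make the generic-proxy writes to shared::cta visible to the async proxy.
  cuda::ptx::fence_proxy_async(cuda::ptx::space_shared);

  // ... an async-proxy operation (e.g. cp.async.bulk) could now read `staging` ...
  out[threadIdx.x] = staging[threadIdx.x];
}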
+ +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_GENERIC_SYNC_RESTRICT_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_GENERIC_SYNC_RESTRICT_H_ + +/* +// fence.proxy.async::generic.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 +// .sem = { .acquire } +// .space = { .shared::cluster } +// .scope = { .cluster } +template +__device__ static inline void fence_proxy_async_generic_sync_restrict( + cuda::ptx::sem_acquire_t, + cuda::ptx::space_cluster_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_generic_sync_restrict_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_async_generic_sync_restrict(sem_acquire_t, space_cluster_t, scope_cluster_t) +{ +// __sem == sem_acquire (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_generic_sync_restrict_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// fence.proxy.async::generic.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 +// .sem = { .release } +// .space = { .shared::cta } +// .scope = { .cluster } +template +__device__ static inline void fence_proxy_async_generic_sync_restrict( + cuda::ptx::sem_release_t, + cuda::ptx::space_shared_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_generic_sync_restrict_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_async_generic_sync_restrict(sem_release_t, space_shared_t, scope_cluster_t) +{ +// __sem == sem_release (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_generic_sync_restrict_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_GENERIC_SYNC_RESTRICT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h index 1e6119ee032..8988292b6d3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h @@ -19,21 +19,27 @@ _CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, sco { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." 
: : : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 830 @@ -56,33 +62,39 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile("fence.proxy.tensormap::generic.acquire.cta [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.proxy.tensormap::generic.acquire.cta [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." 
+ : + : "l"(__addr), "n"(__size.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h new file mode 100644 index 00000000000..4930bec068b --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h @@ -0,0 +1,62 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_SYNC_RESTRICT_H_ +#define _CUDA_PTX_GENERATED_FENCE_SYNC_RESTRICT_H_ + +/* +// fence.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 +// .sem = { .acquire } +// .space = { .shared::cluster } +// .scope = { .cluster } +template +__device__ static inline void fence_sync_restrict( + cuda::ptx::sem_acquire_t, + cuda::ptx::space_cluster_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_sync_restrict_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_sync_restrict(sem_acquire_t, space_cluster_t, scope_cluster_t) +{ +// __sem == sem_acquire (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.acquire.sync_restrict::shared::cluster.cluster;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_sync_restrict_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// fence.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 +// .sem = { .release } +// .space = { .shared::cta } +// .scope = { .cluster } +template +__device__ static inline void fence_sync_restrict( + cuda::ptx::sem_release_t, + cuda::ptx::space_shared_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_sync_restrict_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_sync_restrict(sem_release_t, space_shared_t, scope_cluster_t) +{ +// __sem == sem_release (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.release.sync_restrict::shared::cta.cluster;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_sync_restrict_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_FENCE_SYNC_RESTRICT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h index da802adb9db..e5c8fa89225 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h @@ -133,13 +133,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nwarpid_is_not_supported_before template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - 
(_CUDA_VSTD::uint32_t __sreg_value; asm volatile("mov.u32 %0, %%nwarpid;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%nwarpid;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -258,13 +260,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nsmid_is_not_supported_before_S template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile("mov.u32 %0, %%nsmid;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%nsmid;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -293,17 +297,21 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supp template _CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mov.pred P_OUT, %%is_explicit_cluster;\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" : "=r"(__sreg_value) : :); - return static_cast(__sreg_value);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mov.pred P_OUT, %%is_explicit_cluster;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__sreg_value) + : + :); + return static_cast(__sreg_value); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -317,13 +325,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_x_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%clusterid.x;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error 
message + __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -337,13 +347,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_y_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%clusterid.y;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -357,13 +369,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_z_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%clusterid.z;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -377,13 +391,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_b template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%nclusterid.x;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -397,13 +413,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_b template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%nclusterid.y;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, 
%%nclusterid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -417,13 +435,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_b template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%nclusterid.z;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -437,13 +457,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_ctaid.x;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -457,13 +479,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_y() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_ctaid.y;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -477,13 +501,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_ctaid.z;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error 
message - __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -497,13 +523,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_support template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_nctaid.x;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -517,13 +545,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_support template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_nctaid.y;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -537,13 +567,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_support template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_nctaid.z;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -557,13 +589,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctarank_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() { - 
NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_ctarank;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctarank;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -577,13 +611,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctarank_is_not_support template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_nctarank;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctarank;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -597,13 +633,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%lanemask_eq;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_eq;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -617,13 +655,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_le_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%lanemask_le;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_le;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 
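// A hedged usage sketch for the special-register wrappers converted above: each
// get_sreg_* function reads one PTX special register and returns its value. The
// kernel below is illustrative; the cluster-rank registers require SM_90, and on
// older architectures these calls resolve to the link-error stubs shown in the diff.

#include <cuda/ptx>
#include <cstdio>

__global__ void report_cluster_position()
{
  const unsigned rank  = cuda::ptx::get_sreg_cluster_ctarank();  // %cluster_ctarank
  const unsigned total = cuda::ptx::get_sreg_cluster_nctarank(); // %cluster_nctarank
  if (threadIdx.x == 0)
  {
    printf("CTA %u of %u in its cluster\n", rank, total);
  }
}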
@@ -637,13 +677,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_lt() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -657,13 +699,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%lanemask_ge;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_ge;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -677,13 +721,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%lanemask_gt;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -712,13 +758,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock_hi_is_not_supported_befor template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile("mov.u32 %0, %%clock_hi;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%clock_hi;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with 
a semi-decent error message + __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 500 @@ -732,13 +780,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock64_is_not_supported_before template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint64_t __sreg_value; asm volatile("mov.u64 %0, %%clock64;" : "=l"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint64_t __sreg_value; + asm volatile("mov.u64 %0, %%clock64;" : "=l"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -752,13 +802,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_globaltimer() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint64_t __sreg_value; asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint64_t __sreg_value; + asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 310 @@ -772,13 +824,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 310 @@ -792,13 +846,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || 
__CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 310 @@ -812,13 +868,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_total_smem_size_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%total_smem_size;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%total_smem_size;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 410 @@ -832,13 +890,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%aggr_smem_size;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%aggr_smem_size;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 810 @@ -852,13 +912,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_dynamic_smem_size_is_not_suppor template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%dynamic_smem_size;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%dynamic_smem_size;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 410 @@ -872,13 +934,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_current_graph_exec_is_not_suppo template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_50, - (_CUDA_VSTD::uint64_t __sreg_value; asm("mov.u64 %0, %%current_graph_exec;" : "=l"(__sreg_value) : 
:); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500 + _CUDA_VSTD::uint64_t __sreg_value; + asm("mov.u64 %0, %%current_graph_exec;" : "=l"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h index 22bb73180dc..c78637db3e9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h @@ -4,7 +4,7 @@ #define _CUDA_PTX_GENERATED_GETCTARANK_H_ /* -// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 +// getctarank.space.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } template __device__ static inline uint32_t getctarank( @@ -16,15 +16,16 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_getctarank_is_not_supported_before_SM_90 template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, const void* __addr) { - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __dest; - asm("getctarank.shared::cluster.u32 %0, %1;" : "=r"(__dest) : "r"(__as_ptr_smem(__addr)) :); - return __dest;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("getctarank.shared::cluster.u32 %0, %1;" : "=r"(__dest) : "r"(__as_ptr_smem(__addr)) :); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h new file mode 100644 index 00000000000..f93c8a62157 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h @@ -0,0 +1,33 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_MAPA_H_ +#define _CUDA_PTX_GENERATED_MAPA_H_ + +/* +// mapa.space.u32 dest, addr, target_cta; // PTX ISA 78, SM_90 +// .space = { .shared::cluster } +template +__device__ static inline Tp* mapa( + cuda::ptx::space_cluster_t, + const Tp* addr, + uint32_t target_cta); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mapa_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _Tp* mapa(space_cluster_t, const _Tp* __addr, _CUDA_VSTD::uint32_t __target_cta) +{ +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("mapa.shared::cluster.u32 %0, %1, %2;" : "=r"(__dest) : "r"(__as_ptr_smem(__addr)), "r"(__target_cta) :); + return __from_ptr_dsmem<_Tp>(__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mapa_is_not_supported_before_SM_90__(); + return __from_ptr_dsmem<_Tp>(0); +# endif +} +#endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_MAPA_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h index c7102ebfdb5..5f7b23dbb68 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h @@ -14,14 +14,18 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_ template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint64_t __state; asm("mbarrier.arrive.shared.b64 %0, [%1]; " - " // 1. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)) : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + _CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.shared.b64 %0, [%1]; // 1. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 700 @@ -38,21 +42,23 @@ template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.shared::cta.b64 %0, [%1], %2; " - "// 2. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), - "r"(__count) : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. 
" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 /* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 +// mbarrier.arrive.sem.scope.space.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -71,29 +77,34 @@ mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VS { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1]; // 3a. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 +// mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -117,29 +128,34 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2; // 3b. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. 
" - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 +// mbarrier.arrive.sem.scope.space.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -156,21 +172,23 @@ template _CCCL_DEVICE static inline void mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; " - " // 4a. " : : "r"(__as_ptr_remote_dsmem(__addr)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " + : + : "r"(__as_ptr_remote_dsmem(__addr)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 +// mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -188,18 +206,180 @@ template _CCCL_DEVICE static inline void mbarrier_arrive( sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; " - "// 4b. 
" : : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__count) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__count) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90 +// .space = { .shared::cta } +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( + space_shared_t, + sem_relaxed_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __count) +{ + // __space == space_shared (due to parameter type constraint) + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.relaxed.cta.shared::cta.b64 %0, [%1], %2;" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.relaxed.cluster.shared::cta.b64 %0, [%1], %2;" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90 +// .space = { .shared::cta } +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive(space_shared_t, sem_relaxed_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr) +{ + // __space == space_shared (due to parameter type constraint) + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.relaxed.cta.shared::cta.b64 %0, [%1];" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.relaxed.cluster.shared::cta.b64 %0, [%1];" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // PTX ISA 86, SM_90 +// .space = { .shared::cluster } +// .sem = { .relaxed } +// .scope = { .cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_arrive( + space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ +// __space == space_cluster (due to parameter type constraint) +// __sem == sem_relaxed (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [%0], %1;" + : + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.arrive.sem.scope.space.b64 _, [addr]; // PTX ISA 86, SM_90 +// .space = { .shared::cluster } +// .sem = { .relaxed } +// .scope = { .cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +mbarrier_arrive(space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr) +{ +// __space == space_cluster (due to parameter type constraint) +// __sem == sem_relaxed (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [%0];" : : "r"(__as_ptr_smem(__addr)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h index dc33b212e21..5cbcd4cb3aa 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h @@ -4,7 +4,7 @@ #define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ /* -// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -28,29 +28,34 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 8. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return 0;)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -68,18 +73,104 @@ template _CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " : : "r"( - __as_ptr_remote_dsmem(__addr)), - "r"(__tx_count) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. 
" + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90 +// .space = { .shared::cta } +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& txCount); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( + space_shared_t, + sem_relaxed_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __txCount) +{ + // __space == space_shared (due to parameter type constraint) + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 %0, [%1], %2;" + : "=l"(__state) + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.expect_tx.relaxed.cluster.shared::cta.b64 %0, [%1], %2;" + : "=l"(__state) + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], txCount; // PTX ISA 86, SM_90 +// .space = { .shared::cluster } +// .sem = { .relaxed } +// .scope = { .cluster } +template +__device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr, + const uint32_t& txCount); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( + space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __txCount) +{ +// __space == space_cluster (due to parameter type constraint) +// __sem == sem_relaxed (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64 _, [%0], %1;" + : + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h 
b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h index 45c444c5364..2a9ebacf295 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h @@ -16,16 +16,18 @@ template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; " - "// 5. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), - "r"(__count) : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + _CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h new file mode 100644 index 00000000000..94d66b79a35 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h @@ -0,0 +1,94 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_EXPECT_TX_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_EXPECT_TX_H_ + +/* +// mbarrier.expect_tx.sem.scope.space.b64 [addr], txCount; // 1. PTX ISA 80, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline void mbarrier_expect_tx( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + uint32_t txCount); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_expect_tx( + sem_relaxed_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __txCount) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1; // 1." + : + : "r"(__as_ptr_smem(__addr)), "r"(__txCount) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.expect_tx.relaxed.cluster.shared::cta.b64 [%0], %1; // 1." + : + : "r"(__as_ptr_smem(__addr)), "r"(__txCount) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_expect_tx_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.expect_tx.sem.scope.space.b64 [addr], txCount; // 2. 
PTX ISA 80, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_expect_tx( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_cluster_t, + uint64_t* addr, + uint32_t txCount); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_expect_tx( + sem_relaxed_t, scope_t<_Scope> __scope, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __txCount) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.expect_tx.relaxed.cta.shared::cluster.b64 [%0], %1; // 2." + : + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.expect_tx.relaxed.cluster.shared::cluster.b64 [%0], %1; // 2." + : + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_expect_tx_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h index 6b3041de0d2..9ba345f8ff2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h @@ -15,12 +15,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_init_is_not_supported_before_SM template _CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (asm("mbarrier.init.shared.b64 [%0], %1;" : : "r"(__as_ptr_smem(__addr)), "r"(__count) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + asm("mbarrier.init.shared.b64 [%0], %1;" : : "r"(__as_ptr_smem(__addr)), "r"(__count) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__(); +# endif } #endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h index 9adc677c76d..53263270f0d 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h @@ -15,23 +15,26 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_befo template _CCCL_DEVICE static inline bool mbarrier_test_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), - "l"(__state) : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 700 /* -// mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX +// mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } @@ -50,31 +53,87 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 2. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. " - "\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 2. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_test_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_test_wait( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.test_wait.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.test_wait.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h index 1166b336d2d..3a281e22087 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h @@ -16,23 +16,26 @@ template _CCCL_DEVICE static inline bool mbarrier_test_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), - "r"(__phaseParity) : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 710 /* -// mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX +// mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } @@ -51,30 +54,87 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_test_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_test_wait_parity( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h index 52fa5a4928a..c048136b87a 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h @@ -15,18 +15,21 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_befor template _CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), - "l"(__state) : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -44,25 +47,27 @@ template _CCCL_DEVICE static inline bool mbarrier_try_wait( _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state, const _CUDA_VSTD::uint32_t& __suspendTimeHint) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), - "l"(__state), - "r"(__suspendTimeHint) : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 /* -// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. -PTX ISA 80, SM_90 +// mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX +ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -80,36 +85,40 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. 
" - "\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. -PTX ISA 80, SM_90 +// mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX +ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -132,30 +141,147 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_try_wait( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2, %3;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), + "r"(__as_ptr_smem(__addr)), + "l"(__state), + "r"(__suspendTimeHint) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2, %3;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), + "r"(__as_ptr_smem(__addr)), + "l"(__state), + "r"(__suspendTimeHint) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_try_wait( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), 
"r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h index aa15e255352..0d6f7d3a9df 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h @@ -16,18 +16,21 @@ template _CCCL_DEVICE static inline bool mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), - "r"(__phaseParity) : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -45,25 +48,27 @@ template _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity, const _CUDA_VSTD::uint32_t& __suspendTimeHint) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), - "r"(__phaseParity), - "r"(__suspendTimeHint) : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 /* -// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. -PTX ISA 80, SM_90 +// mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX +ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -81,35 +86,40 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. -PTX ISA 80, SM_90 +// mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
PTX +ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -132,30 +142,148 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // PTX ISA 86, +SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_try_wait_parity( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2, %3;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), + "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity), + "r"(__suspendTimeHint) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2, %3;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), + "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity), + "r"(__suspendTimeHint) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_try_wait_parity( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + 
"mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h new file mode 100644 index 00000000000..51de5257bba --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h @@ -0,0 +1,2148 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MULTIMEM_LD_REDUCE_H_ +#define _CUDA_PTX_GENERATED_MULTIMEM_LD_REDUCE_H_ + +/* +// multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .min } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_weak_t, op_min_t, const _CUDA_VSTD::uint32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.min.u32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, const _CUDA_VSTD::uint32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.min.u32 %0, 
[%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .min } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_weak_t, op_min_t, const _CUDA_VSTD::uint64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.min.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, const _CUDA_VSTD::uint64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem 
== sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .min } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_weak_t, op_min_t, const _CUDA_VSTD::int32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + asm("multimem.ld_reduce.weak.global.min.s32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + 
cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, const _CUDA_VSTD::int32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .min } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_weak_t, op_min_t, const _CUDA_VSTD::int64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + asm("multimem.ld_reduce.weak.global.min.s64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error 
message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, const _CUDA_VSTD::int64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .max } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_weak_t, op_max_t, const _CUDA_VSTD::uint32_t* __addr) +{ +// __sem == 
sem_weak (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.max.u32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, const _CUDA_VSTD::uint32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .max } 
+template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_weak_t, op_max_t, const _CUDA_VSTD::uint64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.max.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, const _CUDA_VSTD::uint64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.max.u64 %0, [%1];" + : 
"=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .max } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_weak_t, op_max_t, const _CUDA_VSTD::int32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + asm("multimem.ld_reduce.weak.global.max.s32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, const _CUDA_VSTD::int32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.max.s32 %0, [%1];" + : "=r"(__dest) + 
: "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .max } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_weak_t, op_max_t, const _CUDA_VSTD::int64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + asm("multimem.ld_reduce.weak.global.max.s64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, const _CUDA_VSTD::int64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.max.s64 %0, [%1];" + : "=l"(__dest) + : 
"l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .add } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_weak_t, op_add_t, const _CUDA_VSTD::uint32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.add.u32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, const _CUDA_VSTD::uint32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.add.u32 %0, [%1];" + : "=r"(__dest) + : 
"l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .add } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_weak_t, op_add_t, const _CUDA_VSTD::uint64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.add.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, const _CUDA_VSTD::uint64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ 
>= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .add } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_weak_t, op_add_t, const _CUDA_VSTD::int32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + asm("multimem.ld_reduce.weak.global.add.s32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t 
+multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, const _CUDA_VSTD::int32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .add } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_weak_t, op_add_t, const _CUDA_VSTD::int64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + asm("multimem.ld_reduce.weak.global.add.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// 
.scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, const _CUDA_VSTD::int64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .and } +template = true> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_and_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B32 multimem_ld_reduce(sem_weak_t, op_and_op_t, const _B32* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.and.b32 
%0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .and } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B32 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_and_op_t, const _B32* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// 
multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .or } +template = true> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_or_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B32 multimem_ld_reduce(sem_weak_t, op_or_op_t, const _B32* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.or.b32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .or } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B32 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_or_op_t, const _B32* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR 
(__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .xor } +template = true> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_xor_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B32 multimem_ld_reduce(sem_weak_t, op_xor_op_t, const _B32* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.xor.b32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .xor } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B32 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_xor_op_t, const _B32* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + 
asm("multimem.ld_reduce.relaxed.gpu.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .and } +template = true> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_and_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B64 multimem_ld_reduce(sem_weak_t, op_and_op_t, const _B64* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.and.b64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .and } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B64 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_and_op_t, const _B64* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || 
__scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .or } +template = true> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_or_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B64 multimem_ld_reduce(sem_weak_t, op_or_op_t, const _B64* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.or.b64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b64 dest, 
[addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .or } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B64 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_or_op_t, const _B64* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .xor } +template = true> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_xor_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B64 multimem_ld_reduce(sem_weak_t, op_xor_op_t, const _B64* __addr) +{ + // __sem == 
sem_weak (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.xor.b64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .xor } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B64 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_xor_op_t, const _B64* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker 
error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_MULTIMEM_LD_REDUCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_red.h b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_red.h new file mode 100644 index 00000000000..1ef97121d31 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_red.h @@ -0,0 +1,1272 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MULTIMEM_RED_H_ +#define _CUDA_PTX_GENERATED_MULTIMEM_RED_H_ + +/* +// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, _CUDA_VSTD::uint32_t* __addr, _CUDA_VSTD::uint32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 +// .sem 
= { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, _CUDA_VSTD::int32_t* __addr, _CUDA_VSTD::int32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + 
_CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, _CUDA_VSTD::int64_t* __addr, _CUDA_VSTD::int64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == 
scope_cta) + { + asm("multimem.red.release.cta.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, _CUDA_VSTD::uint32_t* __addr, _CUDA_VSTD::uint32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, _CUDA_VSTD::int32_t* __addr, _CUDA_VSTD::int32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || 
__scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, _CUDA_VSTD::int64_t* __addr, _CUDA_VSTD::int64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) 
+ { + asm("multimem.red.relaxed.sys.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, _CUDA_VSTD::uint32_t* __addr, _CUDA_VSTD::uint32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.add.u32 [%0], 
%1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, 
_CUDA_VSTD::int32_t* __addr, _CUDA_VSTD::int32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, _CUDA_VSTD::int64_t* __addr, _CUDA_VSTD::int64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + 
asm("multimem.red.relaxed.gpu.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .and } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_and_op_t, _B32* __addr, _B32 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.and.b32 [%0], %1;" + : + : 
"l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .or } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_or_op_t, _B32* __addr, _B32 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .xor } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_xor_op_t, _B32* __addr, _B32 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .and } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_and_op_t, _B64* __addr, _B64 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .or } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_or_op_t, _B64* __addr, _B64 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == 
sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .xor } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_xor_op_t, _B64* __addr, _B64 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_MULTIMEM_RED_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_st.h b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_st.h new file mode 100644 index 00000000000..91319874243 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_st.h @@ -0,0 +1,186 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MULTIMEM_ST_H_ +#define _CUDA_PTX_GENERATED_MULTIMEM_ST_H_ + +/* +// multimem.st.sem.global.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .weak } +template = true> +__device__ static inline void multimem_st( + cuda::ptx::sem_weak_t, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline void multimem_st(sem_weak_t, _B32* __addr, _B32 __val) +{ + // __sem == sem_weak (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("multimem.st.weak.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void multimem_st(sem_t<_Sem> __sem, scope_t<_Scope> __scope, _B32* __addr, _B32 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || 
__CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.st.relaxed.cta.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.st.relaxed.cluster.global.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.st.relaxed.gpu.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.st.relaxed.sys.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.st.release.cta.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.st.release.cluster.global.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.st.release.gpu.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.st.release.sys.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.st.sem.global.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .weak } +template = true> +__device__ static inline void multimem_st( + cuda::ptx::sem_weak_t, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline void multimem_st(sem_weak_t, _B64* __addr, _B64 __val) +{ + // __sem == sem_weak (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("multimem.st.weak.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void multimem_st(sem_t<_Sem> __sem, scope_t<_Scope> __scope, _B64* __addr, _B64 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == 
scope_sys, ""); + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.st.relaxed.cta.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.st.relaxed.cluster.global.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.st.relaxed.gpu.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.st.relaxed.sys.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.st.release.cta.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.st.release.cluster.global.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.st.release.gpu.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.st.release.sys.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_MULTIMEM_ST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h index 74110933270..767411d4719 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h @@ -4,8 +4,8 @@ #define _CUDA_PTX_GENERATED_RED_ASYNC_H_ /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } template @@ -21,23 +21,23 @@ template _CCCL_DEVICE static inline void red_async( op_inc_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_inc (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) 
|| __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } template @@ -53,23 +53,23 @@ template _CCCL_DEVICE static inline void red_async( op_dec_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_dec (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .min } template @@ -85,23 +85,23 @@ template _CCCL_DEVICE static inline void red_async( op_min_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker 
error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .max } template @@ -117,23 +117,23 @@ template _CCCL_DEVICE static inline void red_async( op_max_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .add } template @@ -149,23 +149,23 @@ template _CCCL_DEVICE static inline void red_async( op_add_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// 
red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .s32 } // .op = { .min } template @@ -181,23 +181,23 @@ template _CCCL_DEVICE static inline void red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_s32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .s32 } // .op = { .max } template @@ -213,23 +213,23 @@ template _CCCL_DEVICE static inline void red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_s32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .s32 } // .op = { .add } template @@ -245,26 +245,26 @@ template _CCCL_DEVICE static inline void red_async(op_add_t, _CUDA_VSTD::int32_t* __dest, const 
_CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_s32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .b32 } // .op = { .and } -template +template = true> __device__ static inline void red_async( cuda::ptx::op_and_op_t, B32* dest, @@ -273,31 +273,31 @@ __device__ static inline void red_async( */ #if __cccl_ptx_isa >= 810 extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void red_async(op_and_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) { // __type == type_b32 (due to parameter type constraint) // __op == op_and_op (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__as_b32(__value)), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .b32 } // .op = { .or } -template +template = true> __device__ static inline void red_async( cuda::ptx::op_or_op_t, B32* dest, @@ -306,31 +306,31 @@ __device__ static inline void red_async( */ #if __cccl_ptx_isa >= 810 extern "C" _CCCL_DEVICE void 
__cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void red_async(op_or_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) { // __type == type_b32 (due to parameter type constraint) // __op == op_or_op (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__as_b32(__value)), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .b32 } // .op = { .xor } -template +template = true> __device__ static inline void red_async( cuda::ptx::op_xor_op_t, B32* dest, @@ -339,28 +339,28 @@ __device__ static inline void red_async( */ #if __cccl_ptx_isa >= 810 extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) { // __type == type_b32 (due to parameter type constraint) // __op == op_xor_op (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__as_b32(__value)), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u64 } // .op = { .add } template @@ -376,22 +376,22 @@ template _CCCL_DEVICE static inline void red_async( op_add_t, _CUDA_VSTD::uint64_t* __dest, const _CUDA_VSTD::uint64_t& __value, 
_CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "l"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } template @@ -407,16 +407,17 @@ template _CCCL_DEVICE static inline void red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __value, _CUDA_VSTD::int64_t* __remote_bar) { - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 " - "intentional" : : "r"(__as_ptr_remote_dsmem(__dest)), - "l"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 " + "intentional" + : + : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h index e6c3fcf1737..e59208e59ba 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h @@ -4,7 +4,7 @@ #define _CUDA_PTX_GENERATED_ST_ASYNC_H_ /* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.type [addr], value, [remote_bar]; // 1. 
PTX ISA 81, SM_90 // .type = { .b32, .b64 } template @@ -19,28 +19,30 @@ template _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _CUDA_VSTD::uint64_t* __remote_bar) { static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.type [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 // .type = { .b32, .b64 } template @@ -55,35 +57,37 @@ template _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2], _CUDA_VSTD::uint64_t* __remote_bar) { static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__as_b32(__value[0])), - "r"(__as_b32(__value[1])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "l"(__as_b64(__value[0])), - "l"(__as_b64(__value[1])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. 
" + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "l"(__as_b64(__value[0])), + "l"(__as_b64(__value[1])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, -SM_90 template +SM_90 template = true> __device__ static inline void st_async( B32* addr, const B32 (&value)[4], @@ -91,22 +95,24 @@ __device__ static inline void st_async( */ #if __cccl_ptx_isa >= 810 extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], _CUDA_VSTD::uint64_t* __remote_bar) { static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // " - "3. " : : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__as_b32(__value[0])), - "r"(__as_b32(__value[1])), - "r"(__as_b32(__value[2])), - "r"(__as_b32(__value[3])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_b32(__value[2])), + "r"(__as_b32(__value[3])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/generated/st_bulk.h new file mode 100644 index 00000000000..bc02c785f86 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_bulk.h @@ -0,0 +1,31 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_ST_BULK_H_ +#define _CUDA_PTX_GENERATED_ST_BULK_H_ + +/* +// st.bulk.weak.shared::cta [addr], size, initval; // PTX ISA 86, SM_100 +template +__device__ static inline void st_bulk( + void* addr, + uint64_t size, + cuda::ptx::n32_t initval); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_st_bulk_is_not_supported_before_SM_100__(); +template +_CCCL_DEVICE static inline void st_bulk(void* __addr, _CUDA_VSTD::uint64_t __size, n32_t<_N32> __initval) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + asm("st.bulk.weak.shared::cta [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__addr)), "l"(__size), "n"(__initval.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_bulk_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_ST_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h new file mode 100644 index 00000000000..27ca2f86080 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h @@ -0,0 +1,105 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TCGEN05_ALLOC_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_ALLOC_H_ + +/* +// tcgen05.alloc.cta_group.sync.aligned.shared::cta.b32 [dst], nCols; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_alloc( + cuda::ptx::cta_group_t cta_group, + uint32_t* dst, + const uint32_t& nCols); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_alloc_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_alloc(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t* __dst, const _CUDA_VSTD::uint32_t& __nCols) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__dst)), "r"(__nCols) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__dst)), "r"(__nCols) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_alloc_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.dealloc.cta_group.sync.aligned.b32 taddr, nCols; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_dealloc( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + const uint32_t& nCols); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_dealloc_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_dealloc(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, const _CUDA_VSTD::uint32_t& __nCols) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + 
_CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.dealloc.cta_group::1.sync.aligned.b32 %0, %1;" : : "r"(__taddr), "r"(__nCols) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.dealloc.cta_group::2.sync.aligned.b32 %0, %1;" : : "r"(__taddr), "r"(__nCols) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_dealloc_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.relinquish_alloc_permit.cta_group.sync.aligned; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_relinquish_alloc_permit( + cuda::ptx::cta_group_t cta_group); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_relinquish_alloc_permit_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_relinquish_alloc_permit(cta_group_t<_Cta_Group> __cta_group) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned;" : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_relinquish_alloc_permit_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_ALLOC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_commit.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_commit.h new file mode 100644 index 00000000000..30865d000df --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_commit.h @@ -0,0 +1,81 @@ +// This file was automatically generated. Do not edit. 
+
+#ifndef _CUDA_PTX_GENERATED_TCGEN05_COMMIT_H_
+#define _CUDA_PTX_GENERATED_TCGEN05_COMMIT_H_
+
+/*
+// tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; // PTX ISA 86, SM_100a, SM_101a
+// .cta_group = { .cta_group::1, .cta_group::2 }
+template <cuda::ptx::dot_cta_group Cta_Group>
+__device__ static inline void tcgen05_commit(
+  cuda::ptx::cta_group_t<Cta_Group> cta_group,
+  uint64_t* smem_bar);
+*/
+#if __cccl_ptx_isa >= 860
+extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_commit_is_not_supported_before_SM_100a_SM_101a__();
+template <dot_cta_group _Cta_Group>
+_CCCL_DEVICE static inline void tcgen05_commit(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint64_t* __smem_bar)
+{
+  static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
+# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+  _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+  {
+    asm volatile("tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%0];"
+                 :
+                 : "r"(__as_ptr_dsmem(__smem_bar))
+                 : "memory");
+  }
+  else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+  {
+    asm volatile("tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%0];"
+                 :
+                 : "r"(__as_ptr_dsmem(__smem_bar))
+                 : "memory");
+  }
+# else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_tcgen05_commit_is_not_supported_before_SM_100a_SM_101a__();
+# endif
+}
+#endif // __cccl_ptx_isa >= 860
+
+/*
+// tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; // PTX ISA
+86, SM_100a, SM_101a
+// .cta_group = { .cta_group::1, .cta_group::2 }
+template <cuda::ptx::dot_cta_group Cta_Group>
+__device__ static inline void tcgen05_commit_multicast(
+  cuda::ptx::cta_group_t<Cta_Group> cta_group,
+  uint64_t* smem_bar,
+  uint16_t ctaMask);
+*/
+#if __cccl_ptx_isa >= 860
+extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_commit_multicast_is_not_supported_before_SM_100a_SM_101a__();
+template <dot_cta_group _Cta_Group>
+_CCCL_DEVICE static inline void tcgen05_commit_multicast(
+  cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint64_t* __smem_bar, _CUDA_VSTD::uint16_t __ctaMask)
+{
+  static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
+# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+  _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+  {
+    asm volatile("tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%0], %1;"
+                 :
+                 : "r"(__as_ptr_dsmem(__smem_bar)), "h"(__ctaMask)
+                 : "memory");
+  }
+  else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+  {
+    asm volatile("tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%0], %1;"
+                 :
+                 : "r"(__as_ptr_dsmem(__smem_bar)), "h"(__ctaMask)
+                 : "memory");
+  }
+# else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_tcgen05_commit_multicast_is_not_supported_before_SM_100a_SM_101a__();
+# endif
+}
+#endif // __cccl_ptx_isa >= 860
+
+#endif // _CUDA_PTX_GENERATED_TCGEN05_COMMIT_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_cp.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_cp.h
new file mode 100644
index 00000000000..e213f9ba745
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_cp.h
@@ -0,0 +1,612 @@
+// This file was automatically generated. Do not edit.
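+//
+// A minimal usage sketch (illustrative only, not emitted by the generator). It assumes
+// the wrappers are exposed as cuda::ptx::tcgen05_cp_* and that cuda::ptx::cta_group_1
+// names the .cta_group::1 selector; `taddr` (a tensor-memory address, e.g. obtained via
+// tcgen05_alloc) and `s_desc` (a 64-bit shared-memory matrix descriptor) are placeholders.
+//
+//   cuda::std::uint32_t taddr  = /* tensor-memory address */ 0;
+//   cuda::std::uint64_t s_desc = /* shared-memory matrix descriptor */ 0;
+//   // Copy a 128x256b block described by s_desc from shared memory into tensor memory.
+//   cuda::ptx::tcgen05_cp_128x256b(cuda::ptx::cta_group_1, taddr, s_desc);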
+ +#ifndef _CUDA_PTX_GENERATED_TCGEN05_CP_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_CP_H_ + +/* +// tcgen05.cp.cta_group.128x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x256b_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_cp_128x256b(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x256b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x256b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.4x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_4x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_4x256b_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_cp_4x256b(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.4x256b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.4x256b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_4x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x128b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x128b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x128b_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_cp_128x128b(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x128b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x128b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::02_13 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_02_13( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_02_13( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::01_23 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_01_23( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_01_23( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.32x128b.warpx4 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_32x128b_warpx4( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_32x128b_warpx4_is_not_supported_before_SM_100a_SM_101a__(); +template 
+_CCCL_DEVICE static inline void tcgen05_cp_32x128b_warpx4( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.32x128b.warpx4 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.32x128b.warpx4 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_32x128b_warpx4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x256b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_128x256b_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x256b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.4x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_4x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_4x256b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_4x256b_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_cp_4x256b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x128b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x128b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x128b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_128x128b_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x128b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE 
static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x256b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_128x256b_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), 
"l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x256b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.4x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_4x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_4x256b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_4x256b_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_4x256b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x128b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x128b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x128b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_128x128b_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x128b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template 
+_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) 
+ { + asm("tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_CP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_fence.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_fence.h new file mode 100644 index 00000000000..efedcf86a57 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_fence.h @@ -0,0 +1,44 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TCGEN05_FENCE_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_FENCE_H_ + +/* +// tcgen05.fence::before_thread_sync; // PTX ISA 86, SM_100a, SM_101a +template +__device__ static inline void tcgen05_fence_before_thread_sync(); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_fence_before_thread_sync_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_fence_before_thread_sync() +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm volatile("tcgen05.fence::before_thread_sync;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_fence_before_thread_sync_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.fence::after_thread_sync; // PTX ISA 86, SM_100a, SM_101a +template +__device__ static inline void tcgen05_fence_after_thread_sync(); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_fence_after_thread_sync_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_fence_after_thread_sync() +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm volatile("tcgen05.fence::after_thread_sync;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_fence_after_thread_sync_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_FENCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_ld.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_ld.h new file mode 100644 index 00000000000..e5ec1b686c2 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_ld.h @@ -0,0 +1,4446 @@ +// This file was automatically generated. Do not edit. 
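+//
+// A minimal usage sketch (illustrative only, not emitted by the generator). The overload
+// set below is selected by the extent of the output array: a 2-element array of 4-byte
+// values picks the .x2 variant, a 4-element array the .x4 variant, and so on. `taddr` is
+// a placeholder tensor-memory address, and the wrapper is assumed to be exposed as
+// cuda::ptx::tcgen05_ld_16x64b.
+//
+//   cuda::std::uint32_t regs[2];
+//   cuda::ptx::tcgen05_ld_16x64b(regs, taddr); // emits tcgen05.ld.sync.aligned.16x64b.x2.b32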
+ +#ifndef _CUDA_PTX_GENERATED_TCGEN05_LD_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_LD_H_ + +/* +// tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[1], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x1.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[1], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x2.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + 
asm("tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x4.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> 
+__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, " + "[%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} 
+#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void 
tcgen05_ld_16x64b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + 
"=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x128.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + 
"=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), 
+ "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x1.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x2.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x4.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : 
"r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, " + "[%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + 
"=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) 
+ : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, 
%37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + 
"=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + 
"=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x2.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[16], + uint32_t 
taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x4.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, " + "[%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + 
"=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + 
"=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + 
"=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + 
"=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[1], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x1.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[1], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if 
_CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x2.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x4.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ 
static inline void tcgen05_ld_32x32b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, " + "[%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[16], + uint32_t taddr); +*/ 
+#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, 
%24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[64], + uint32_t 
taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x128.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, 
%120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, 
%21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[1], + uint32_t taddr, + cuda::ptx::n32_t 
immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x1.b32 {%0}, [%1], %2;" + : "=r"(__out[0]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[1], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 {%0}, [%1], %2;" + : "=r"(__out[0]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[2], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x2.b32 {%0, %1}, [%2], %3;" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[2], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 
(&__out)[2], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 {%0, %1}, [%2], %3;" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[4], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x4.b32 {%0, %1, %2, %3}, [%4], %5;" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[4], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 {%0, %1, %2, %3}, [%4], %5;" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[8], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) 
|| __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8], %9;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[8], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8], %9;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[16], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15}, [%16], %17;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[16], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); 
+*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16], %17;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[32], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32], %33;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[32], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t 
__taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32], %33;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[64], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64], %65;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + 
"=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[64], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64], %65;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[128], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + 
static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x128.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128], %129;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[128], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128], %129;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + 
"=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_LD_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma.h new file mode 100644 index 00000000000..58e3f1e8363 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma.h @@ -0,0 +1,3842 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TCGEN05_MMA_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_MMA_H_ + +/* +// tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // +PTX ISA 86, SM_100a +// .kind = { .kind::f16, .kind::tf32 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4], + bool __enable_input_d, + n32_t<_N32> __scale_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + 
"r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4], + bool __enable_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == 
kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::i8 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_2) + { + asm 
volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::i8 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a +// .kind = { .kind::f16, .kind::tf32 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + n32_t<_N32> __scale_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), 
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::i8 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::i8 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // +PTX ISA 86, SM_100a +// .kind = { .kind::f16, .kind::tf32 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_tmem_a( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4], + bool __enable_input_d, + n32_t<_N32> __scale_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg 
.pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_tmem_a( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4], + bool __enable_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : 
"r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::i8 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + 
"l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::i8 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a +// .kind = { .kind::f16, .kind::tf32 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_tmem_a( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + n32_t<_N32> __scale_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : 
"memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_tmem_a( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::i8 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::i8 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_1x_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + 
"tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_2x_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred 
PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_4x_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent 
error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + kind_t<_Kind> __kind, + 
cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
+__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + 
"tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + 
"tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { 
.cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif 
// __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use( + 
kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : 
"r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; 
\n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" 
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, 
.cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + 
bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void 
tcgen05_mma_block_scale_vec_4x_collector_a_lastuse( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, 
%3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, 
%3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void 
tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// 
tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard( + 
kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " 
+ "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + 
_CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_MMA_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h new file mode 100644 index 00000000000..8d09698052d --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h @@ -0,0 +1,6438 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TCGEN05_MMA_WS_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_MMA_WS_H_ + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + 
"tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg 
.pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + 
"l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, 
idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template 
+__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_mma_ws_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t 
__d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == 
kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 
PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + 
"l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + 
"}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( 
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 
0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_mma_ws_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// 
tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t 
a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +template 
+_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + 
static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred 
PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + 
"l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + 
"}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( 
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 
PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message 
+ __cuda_ptx_tcgen05_mma_ws_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + 
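+// ------------------------------------------------------------------------------------------------------------------
+// Usage sketch (illustrative only; not part of the generated wrappers above). These functions follow the usual
+// <cuda/ptx> calling convention: the first two arguments are dispatch tags selecting .cta_group and .kind, and the
+// remaining arguments mirror the PTX operands. Assuming the tag constants cuda::ptx::cta_group_1 and
+// cuda::ptx::kind_f16 are exposed alongside these wrappers, and that d_tmem / a_tmem hold tensor-memory addresses
+// while b_desc and idesc are a valid shared-memory matrix descriptor and instruction descriptor, a device function
+// could issue a warp-specialized MMA roughly as follows:
+//
+//   __device__ void mma_ws_f16_sketch(
+//     uint32_t d_tmem, uint32_t a_tmem, uint64_t b_desc, uint32_t idesc, bool accumulate)
+//   {
+//   #if __cccl_ptx_isa >= 860
+//     // collector::b1::discard variant with A sourced from tensor memory; `accumulate` maps to the
+//     // enable_input_d operand, i.e. whether the existing contents of D are read as an input to the MMA.
+//     cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard(
+//       cuda::ptx::cta_group_1, cuda::ptx::kind_f16, d_tmem, a_tmem, b_desc, idesc, accumulate);
+//   #endif
+//   }
+//
+// Allocating tensor memory and encoding the descriptors are SM_100a / SM_101a specific and are outside the scope
+// of this sketch.
+// ------------------------------------------------------------------------------------------------------------------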
+/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t 
kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_mma_ws_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + 
_CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || 
__CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : 
"r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + 
"tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } 
+ else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 
PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message 
+ __cuda_ptx_tcgen05_mma_ws_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + 
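// --- Illustrative usage sketch (editorial example, not part of the generated header above) ---
// A minimal device-side example of calling the wrapper defined directly above,
// assuming the usual cuda::ptx tag constants (cuda::ptx::cta_group_1,
// cuda::ptx::kind_f16) and that the translation unit is compiled for
// sm_100a / sm_101a. The tensor-memory addresses and descriptors passed in are
// placeholders; real code must construct valid tmem addresses, shared-memory
// descriptors, an instruction descriptor, and a zero-column mask descriptor as
// specified by the PTX ISA for tcgen05.mma.ws.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void tcgen05_mma_ws_example(cuda::std::uint32_t d_tmem,
                                       cuda::std::uint32_t a_tmem,
                                       cuda::std::uint64_t b_desc,
                                       cuda::std::uint32_t idesc,
                                       cuda::std::uint64_t zero_column_mask_desc)
{
  // f16 weight-stationary MMA, single-CTA group, collector buffer b2,
  // marking this issue as the last use of the collector buffer contents.
  cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
    cuda::ptx::cta_group_1,
    cuda::ptx::kind_f16,
    d_tmem,
    a_tmem,
    b_desc,
    idesc,
    /* enable_input_d */ true,
    zero_column_mask_desc);
}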
+/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + 
cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_mma_ws_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + 
bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, 
""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [%0], 
%1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + 
"setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [%0], 
[%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == 
kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will 
have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // 
__cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + 
uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_mma_ws_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + 
bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, 
""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + 
"tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d, 
%5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + 
"setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_MMA_WS_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_shift.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_shift.h new file mode 100644 index 00000000000..0c28ba5d888 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_shift.h @@ -0,0 +1,36 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TCGEN05_SHIFT_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_SHIFT_H_ + +/* +// tcgen05.shift.cta_group.down [taddr]; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_shift_down( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_shift_down_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_shift_down(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.shift.cta_group::1.down [%0];" : : "r"(__taddr) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.shift.cta_group::2.down [%0];" : : "r"(__taddr) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_shift_down_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_SHIFT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_st.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_st.h new file mode 100644 index 00000000000..83e9d13810e --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_st.h @@ -0,0 +1,4554 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_TCGEN05_ST_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_ST_H_ + +/* +// tcgen05.st.sync.aligned.16x64b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x1.b32 [%0], {%1};" : : "r"(__taddr), "r"(__as_b32(__values[0])) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [%0], {%1};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x2.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + 
static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x4.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x8.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + 
"r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x16.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void 
tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x32.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x64.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + 
"r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + 
"r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x128.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x128.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + 
"r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 
(&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + 
"r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x1.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + 
asm("tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x2.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x4.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# 
else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x8.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if 
_CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x16.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 
(&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x32.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + 
"r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + 
"r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x64.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + 
"r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> 
+_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + 
"r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x1.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + 
"r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x2.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x4.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + 
"%16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x8.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + 
"r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || 
__CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x16.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, 
%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x32.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, 
%98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + 
"r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + 
"r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if 
_CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x1.b32 [%0], {%1};" : : "r"(__taddr), "r"(__as_b32(__values[0])) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [%0], {%1};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x2.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, 
SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x4.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x8.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" 
_CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x16.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + 
"r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x32.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + 
"r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x64.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + 
"r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + 
"r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x128.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x128.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + 
"r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, 
%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + 
"r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x1.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x1.b32 [%0], %1, {%2};" + : + : "r"(__taddr), "n"(__immHalfSplitoff.value), "r"(__as_b32(__values[0])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [%0], %1, {%2};" + : + : "r"(__taddr), "n"(__immHalfSplitoff.value), "r"(__as_b32(__values[0])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x2.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = 
true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x2.b32 [%0], %1, {%2, %3};" + : + : "r"(__taddr), "n"(__immHalfSplitoff.value), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [%0], %1, {%2, %3};" + : + : "r"(__taddr), "n"(__immHalfSplitoff.value), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x4.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x4.b32 [%0], %1, {%2, %3, %4, %5};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + 
uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x8.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x8.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported 
architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x16.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x16.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, +SM_101a template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x32.b32 
[taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x32.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, +SM_101a template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + 
"r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x64.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x64.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + 
"r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, +SM_101a template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + 
"r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x128.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x128.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, " + "%37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, " + "%59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, " + "%81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, " + "%103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, " + "%122, %123, %124, %125, %126, %127, %128, %129};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + 
"r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, +SM_101a template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 
[%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128, %129};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + 
"r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_ST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_wait.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_wait.h new file mode 100644 index 00000000000..5f683c07fea --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_wait.h @@ -0,0 +1,44 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_TCGEN05_WAIT_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_WAIT_H_ + +/* +// tcgen05.wait::ld.sync.aligned; // PTX ISA 86, SM_100a, SM_101a +template +__device__ static inline void tcgen05_wait_ld(); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_wait_ld_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_wait_ld() +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm volatile("tcgen05.wait::ld.sync.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_wait_ld_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.wait::st.sync.aligned; // PTX ISA 86, SM_100a, SM_101a +template +__device__ static inline void tcgen05_wait_st(); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_wait_st_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_wait_st() +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm volatile("tcgen05.wait::st.sync.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_wait_st_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h index b51b5185db0..db5e7dde640 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h @@ -24,37 +24,43 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || 
__CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h index 598b56f90b0..53c56e159f7 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h @@ -4,113 +4,127 @@ #define _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ /* -// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_global_address( cuda::ptx::space_global_t, void* tm_addr, B64 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_address(space_global_t, void* __tm_addr, _B64 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__tm_addr)), - "l"(__as_b64(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "l"(__as_b64(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 
830 /* -// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_global_address( cuda::ptx::space_shared_t, void* tm_addr, B64 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_address(space_shared_t, void* __tm_addr, _B64 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" : : "r"(__as_ptr_smem(__tm_addr)), - "l"(__as_b64(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "l"(__as_b64(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_rank( cuda::ptx::space_global_t, void* tm_addr, B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_rank(space_global_t, void* __tm_addr, _B32 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__tm_addr)), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// 
tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_rank( cuda::ptx::space_shared_t, void* tm_addr, B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_rank(space_shared_t, void* __tm_addr, _B32 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" : : "r"(__as_ptr_smem(__tm_addr)), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_box_dim( cuda::ptx::space_global_t, void* tm_addr, @@ -118,28 +132,30 @@ __device__ static inline void tensormap_replace_box_dim( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_box_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__ord.value), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif 
// __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_box_dim( cuda::ptx::space_shared_t, void* tm_addr, @@ -147,29 +163,30 @@ __device__ static inline void tensormap_replace_box_dim( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_box_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm( - "tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__ord.value), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_global_dim( cuda::ptx::space_global_t, void* tm_addr, @@ -177,28 +194,30 @@ __device__ static inline void tensormap_replace_global_dim( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__ord.value), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), 
"n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_global_dim( cuda::ptx::space_shared_t, void* tm_addr, @@ -206,29 +225,30 @@ __device__ static inline void tensormap_replace_global_dim( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm( - "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__ord.value), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_global_stride( cuda::ptx::space_global_t, void* tm_addr, @@ -236,28 +256,31 @@ __device__ static inline void tensormap_replace_global_stride( B64 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_stride(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__ord.value), - "l"(__as_b64(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - 
__cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_global_stride( cuda::ptx::space_shared_t, void* tm_addr, @@ -265,29 +288,98 @@ __device__ static inline void tensormap_replace_global_stride( B64 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_stride(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm( - "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__ord.value), - "l"(__as_b64(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, +SM_101a // .space = { .global } -template +template = true> +__device__ static inline void tensormap_replace_element_stride( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_element_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tensormap_replace_element_stride(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || 
__CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, +SM_101a +// .space = { .shared::cta } +template = true> +__device__ static inline void tensormap_replace_element_stride( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_element_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tensormap_replace_element_stride(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, +SM_101a +// .space = { .global } +template = true> __device__ static inline void tensormap_replace_element_size( cuda::ptx::space_global_t, void* tm_addr, @@ -295,28 +387,32 @@ __device__ static inline void tensormap_replace_element_size( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_element_size(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__ord.value), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // 
__cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, +SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_element_size( cuda::ptx::space_shared_t, void* tm_addr, @@ -324,27 +420,29 @@ __device__ static inline void tensormap_replace_element_size( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_element_size(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm( - "tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__ord.value), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_elemtype( @@ -353,23 +451,26 @@ __device__ static inline void tensormap_replace_elemtype( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_elemtype(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported 
architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_elemtype( @@ -378,23 +479,26 @@ __device__ static inline void tensormap_replace_elemtype( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_elemtype(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_interleave_layout( @@ -403,24 +507,28 @@ __device__ static inline void tensormap_replace_interleave_layout( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_interleave_layout(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" + : 
+ : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_interleave_layout( @@ -429,24 +537,28 @@ __device__ static inline void tensormap_replace_interleave_layout( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_interleave_layout(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_swizzle_mode( @@ -455,23 +567,27 @@ __device__ static inline void tensormap_replace_swizzle_mode( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || 
__CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_swizzle_mode( @@ -480,23 +596,27 @@ __device__ static inline void tensormap_replace_swizzle_mode( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_fill_mode( @@ -505,23 +625,26 @@ __device__ static inline void tensormap_replace_fill_mode( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); +// __space == space_global (due 
to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_fill_mode( @@ -530,19 +653,78 @@ __device__ static inline void tensormap_replace_fill_mode( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 +/* +// tensormap.replace.tile.swizzle_atomicity.space.b1024.b32 [tm_addr], new_val; // PTX ISA 86, SM_100a, SM_101a +// .space = { .global } +template +__device__ static inline void tensormap_replace_swizzle_atomicity( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_atomicity_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_swizzle_atomicity(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_atomicity_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tensormap.replace.tile.swizzle_atomicity.space.b1024.b32 [tm_addr], new_val; // PTX 
ISA 86, SM_100a, SM_101a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_swizzle_atomicity( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_atomicity_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_swizzle_atomicity(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_atomicity_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h new file mode 100644 index 00000000000..6f5a022dbc8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h @@ -0,0 +1,61 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
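+// A minimal sketch of the same idea, using a hypothetical overload name
+// (`overload_under_test`) that is not part of the generated tests. Writing the
+// overload's address through the escaping `fn_ptr` parameter means the
+// compiler cannot prove the function is unused, so its body must be lowered
+// to PTX:
+//
+//   __device__ void overload_under_test();
+//
+//   __global__ void force_ptx_emission(void** fn_ptr)
+//   {
+//     *fn_ptr++ = reinterpret_cast<void*>(&overload_under_test);
+//   }
+//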
+ +__global__ void test_barrier_cluster_aligned(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.wait.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::barrier_cluster_wait));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive.release.aligned; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive.relaxed.aligned; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // barrier.cluster.wait.acquire.aligned; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::barrier_cluster_wait));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/clusterlaunchcontrol.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/clusterlaunchcontrol.h new file mode 100644 index 00000000000..c5df06bc787 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/clusterlaunchcontrol.h @@ -0,0 +1,84 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ +__global__ void test_clusterlaunchcontrol(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [addr], + // [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::clusterlaunchcontrol_try_cancel));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 + // [addr], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::clusterlaunchcontrol_try_cancel_multicast));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 + // [addr], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::clusterlaunchcontrol_try_cancel_multicast));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 pred_is_canceled, try_cancel_response; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::clusterlaunchcontrol_query_cancel_is_canceled));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 ret_dim, try_cancel_response; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_x));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 ret_dim, try_cancel_response; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_y));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 ret_dim, try_cancel_response; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_z));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 block_dim, try_cancel_response; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h index a342954591a..de118140440 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h @@ -20,20 +20,30 @@ __global__ void test_cp_async_bulk(void** fn_ptr) NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // - // 1a. 
unicast + // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, - // [rdsmem_bar]; // 2. + // [rdsmem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -44,10 +54,21 @@ __global__ void test_cp_async_bulk(void** fn_ptr) NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3. + // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_100, + ( + // cp.async.bulk.global.shared::cta.bulk_group.cp_mask [dstMem], [srcMem], size, byteMask; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_cp_mask));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h index 6e2a986e7bd..81298beb481 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h @@ -21,7 +21,33 @@ __global__ void test_cp_async_bulk_multicast(void** fn_ptr) NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], - // size, [smem_bar], ctaMask; // 1. 
+ // size, [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], + // size, [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], + // size, [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. + // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -44,18 +105,79 @@ __global__ void test_cp_async_bulk_tensor(void** fn_ptr) NV_PROVIDES_SM_90, ( // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1b. 
+ // tensorCoords], [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. + // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -67,18 +189,79 @@ __global__ void test_cp_async_bulk_tensor(void** fn_ptr) NV_PROVIDES_SM_90, ( // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1c. 
+ // tensorCoords], [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. + // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -90,18 +273,79 @@ __global__ void test_cp_async_bulk_tensor(void** fn_ptr) NV_PROVIDES_SM_90, ( // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1d. 
+ // tensorCoords], [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. + // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -113,18 +357,79 @@ __global__ void test_cp_async_bulk_tensor(void** fn_ptr) NV_PROVIDES_SM_90, ( // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1e. 
+ // tensorCoords], [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. + // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h new file mode 100644 index 00000000000..930cfa09125 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h @@ -0,0 +1,180 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
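+
+// The entries below follow the same two-level guard used throughout these
+// generated tests: the preprocessor check on __cccl_ptx_isa drops overloads
+// that need a newer PTX ISA than the current toolkit emits, and NV_IF_TARGET
+// limits each pointer store to architectures that actually provide the
+// instruction. A sketch of the pattern, with a hypothetical wrapper standing
+// in for one of the cuda::ptx functions:
+//
+//   #if __cccl_ptx_isa >= 860
+//     NV_IF_TARGET(NV_PROVIDES_SM_100,
+//                  (
+//                    // compiled only when both the ISA and the target qualify
+//                    * fn_ptr++ = reinterpret_cast<void*>(
+//                      static_cast<void (*)(int*)>(__sketch_copy));));
+//   #endif // __cccl_ptx_isa >= 860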
+ +__global__ void test_cp_async_bulk_tensor_gather_scatter(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_100, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4)); + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4)); + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4)); + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4)); + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor_tile_scatter4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor_tile_scatter4));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h index 617bc9507bd..3f3a08764d2 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h @@ -21,7 +21,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. + // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 #if __cccl_ptx_isa >= 800 @@ -37,7 +116,20 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) NV_HAS_FEATURE_SM_90a, ( // 
cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 #if __cccl_ptx_isa >= 800 @@ -53,7 +211,33 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. 
+ // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. 
+ // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. 
+ // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive.h new file mode 100644 index 00000000000..663c07b4121 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
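+
+// The static_cast in each entry pins down one overload when the cuda::ptx
+// name is an overload set (and is harmless when it is not); only after that
+// is the pointer type erased with reinterpret_cast. A small sketch with
+// hypothetical overloads:
+//
+//   __device__ static void __sketch_arrive(uint64_t* addr);
+//   __device__ static void __sketch_arrive(uint64_t* addr, const uint32_t& count);
+//
+//   // selects the two-argument overload, then stores its address
+//   *fn_ptr++ = reinterpret_cast<void*>(
+//     static_cast<void (*)(uint64_t*, const uint32_t&)>(__sketch_arrive));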
+ +__global__ void test_cp_async_mbarrier_arrive(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET( + NV_PROVIDES_SM_80, + ( + // cp.async.mbarrier.arrive.b64 [addr]; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive_noinc.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive_noinc.h new file mode 100644 index 00000000000..a089c727903 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive_noinc.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_cp_async_mbarrier_arrive_noinc(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // cp.async.mbarrier.arrive.noinc.b64 [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_mbarrier_arrive_noinc));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h new file mode 100644 index 00000000000..298225881d1 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_elect_sync(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // elect.sync _|is_elected, membermask; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::elect_sync));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h index aecfcde5e01..0738677ed33 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h @@ -28,10 +28,24 @@ __global__ void test_fence(void** fn_ptr) static_cast(cuda::ptx::fence)); // fence.sc.sys; // 1. * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.cta; // 1. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 600 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // fence.sc.cluster; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 600 + NV_IF_TARGET( + NV_PROVIDES_SM_70, + ( + // fence.acq_rel.cta; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); // fence.acq_rel.gpu; // 1. * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::fence)); @@ -41,14 +55,46 @@ __global__ void test_fence(void** fn_ptr) #endif // __cccl_ptx_isa >= 600 #if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // fence.acq_rel.cluster; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 860 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // fence.sc.cluster; // 2. + // fence.acquire.cta; * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.cluster; // 2. + static_cast(cuda::ptx::fence)); + // fence.acquire.cluster; * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence));)); -#endif // __cccl_ptx_isa >= 780 + static_cast(cuda::ptx::fence)); + // fence.acquire.gpu; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acquire.sys; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.release.cta; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.release.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.release.gpu; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.release.sys; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async_generic_sync_restrict.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async_generic_sync_restrict.h new file mode 100644 index 00000000000..7af3a09ad2b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async_generic_sync_restrict.h @@ -0,0 +1,38 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ +__global__ void test_fence_proxy_async_generic_sync_restrict(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::fence_proxy_async_generic_sync_restrict));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::fence_proxy_async_generic_sync_restrict));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_sync_restrict.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_sync_restrict.h new file mode 100644 index 00000000000..c673d840428 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_sync_restrict.h @@ -0,0 +1,38 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_fence_sync_restrict(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.acquire.sync_restrict::shared::cluster.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::fence_sync_restrict));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.release.sync_restrict::shared::cta.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::fence_sync_restrict));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h new file mode 100644 index 00000000000..9160be1fe2d --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h @@ -0,0 +1,27 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ +__global__ void test_mapa(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mapa.shared::cluster.u32 dest, addr, target_cta; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mapa));)); +#endif // __cccl_ptx_isa >= 780 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h index 3cddcb3b54c..d32773c118d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h @@ -87,4 +87,60 @@ __global__ void test_mbarrier_arrive(void** fn_ptr) cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t*, const uint32_t&)>( cuda::ptx::mbarrier_arrive));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.relaxed.cta.shared::cta.b64 state, [addr], count; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive)); + // mbarrier.arrive.relaxed.cluster.shared::cta.b64 state, [addr], count; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.relaxed.cta.shared::cta.b64 state, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive)); + // mbarrier.arrive.relaxed.cluster.shared::cta.b64 state, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [addr], count; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h index a2ef4b619bb..8ef925662ac 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h @@ -44,4 +44,33 @@ __global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr) cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t*, const uint32_t&)>( cuda::ptx::mbarrier_arrive_expect_tx));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 state, [addr], txCount; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive_expect_tx)); + // mbarrier.arrive.expect_tx.relaxed.cluster.shared::cta.b64 state, [addr], txCount; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive_expect_tx));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64 _, [addr], txCount; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive_expect_tx));)); +#endif // __cccl_ptx_isa >= 860 } diff --git 
a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_expect_tx.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_expect_tx.h new file mode 100644 index 00000000000..8dd3b6a2037 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_expect_tx.h @@ -0,0 +1,50 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_mbarrier_expect_tx(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [addr], txCount; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_expect_tx)); + // mbarrier.expect_tx.relaxed.cluster.shared::cta.b64 [addr], txCount; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_expect_tx));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.expect_tx.relaxed.cta.shared::cluster.b64 [addr], txCount; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_expect_tx)); + // mbarrier.expect_tx.relaxed.cluster.shared::cluster.b64 [addr], txCount; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_expect_tx));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait.h new file mode 100644 index 00000000000..c9c0d0d14fb --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait.h @@ -0,0 +1,55 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_mbarrier_test_wait(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 700 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // 2. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait)); + // mbarrier.test_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.relaxed.cta.shared::cta.b64 waitComplete, [addr], state; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait)); + // mbarrier.test_wait.relaxed.cluster.shared::cta.b64 waitComplete, [addr], state; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait_parity.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait_parity.h new file mode 100644 index 00000000000..f44c0554308 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait_parity.h @@ -0,0 +1,55 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_mbarrier_test_wait_parity(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 710 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 710 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity)); + // mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64 waitComplete, [addr], phaseParity; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity)); + // mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h index 00166f8172c..1a1b347751c 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h @@ -66,4 +66,35 @@ __global__ void test_mbarrier_try_wait(void** fn_ptr) cuda::ptx::sem_acquire_t, cuda::ptx::scope_cluster_t, uint64_t*, const uint64_t&, const uint32_t&)>( cuda::ptx::mbarrier_try_wait));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.relaxed.cta.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait)); + // mbarrier.try_wait.relaxed.cluster.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.relaxed.cta.shared::cta.b64 waitComplete, [addr], state; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait)); + // mbarrier.try_wait.relaxed.cluster.shared::cta.b64 waitComplete, [addr], state; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h index 8aa588fbab0..4a5ef3e926f 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h @@ -65,4 +65,36 @@ __global__ void test_mbarrier_try_wait_parity(void** fn_ptr) cuda::ptx::sem_acquire_t, cuda::ptx::scope_cluster_t, uint64_t*, const uint32_t&, const uint32_t&)>( cuda::ptx::mbarrier_try_wait_parity));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity)); + // mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 waitComplete, [addr], phaseParity, + // suspendTimeHint; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 waitComplete, [addr], phaseParity; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity)); + // mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h deleted file mode 100644 index 80129e5016c..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h +++ /dev/null @@ -1,24 +0,0 @@ -__global__ void test_mbarrier_test_wait(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_test_wait));)); -#endif // __cccl_ptx_isa >= 700 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.test_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait)); - // mbarrier.test_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h deleted file mode 100644 index 30902c58905..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h +++ /dev/null @@ -1,24 +0,0 @@ -__global__ void test_mbarrier_test_wait_parity(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 710 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_test_wait_parity));)); -#endif // __cccl_ptx_isa >= 710 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait_parity)); - // mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait_parity));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_ld_reduce.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_ld_reduce.h new file mode 100644 index 00000000000..c0259451a1b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_ld_reduce.h @@ -0,0 +1,1020 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
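+
+// The .weak / .relaxed / .acquire and .cta / .cluster / .gpu / .sys variants
+// listed below are selected at the C++ level by tag arguments such as
+// cuda::ptx::sem_relaxed_t and cuda::ptx::scope_cta_t, so each combination is
+// a distinct overload and gets its own pointer store. A sketch of the idea,
+// with simplified tag types and a hypothetical wrapper in place of
+// multimem_ld_reduce:
+//
+//   struct __sem_relaxed_t {};  struct __scope_cta_t {};
+//
+//   __device__ static uint32_t
+//   __sketch_ld_reduce(__sem_relaxed_t, __scope_cta_t, const uint32_t* addr);
+//
+//   // one store per semantics/scope combination
+//   *fn_ptr++ = reinterpret_cast<void*>(
+//     static_cast<uint32_t (*)(__sem_relaxed_t, __scope_cta_t, const uint32_t*)>(
+//       __sketch_ld_reduce));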
+ +__global__ void test_multimem_ld_reduce(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.min.s32 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.min.s32 dest, [addr]; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.min.s64 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( 
+ static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.max.s32 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.max.s64 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + 
+#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.add.s32 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.add.u64 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( 
+ static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // 
multimem.ld_reduce.weak.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_red.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_red.h new file mode 100644 index 00000000000..dd0011e3fb2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_red.h @@ -0,0 +1,840 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. 
+// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_multimem_red(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.max.u64 [addr], val; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.add.u32 
[addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // 
multimem.red.relaxed.sys.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.xor.b64 [addr], val; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_st.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_st.h new file mode 100644 index 00000000000..b61c25430ed --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_st.h @@ -0,0 +1,110 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
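// A reduced sketch of the guard structure used by each block below: the
// preprocessor check keeps the code out of translation units whose toolkit
// cannot emit the required PTX ISA, and NV_IF_TARGET (from <nv/target>)
// restricts it to architectures that implement the instruction. The wrapper
// dummy_st and the version/architecture chosen here are placeholders, not the
// real cuda::ptx entry points; the include is only to make the sketch
// self-contained.

#include <nv/target>

__device__ void dummy_st(unsigned* addr, unsigned val)
{
  *addr = val; // hypothetical stand-in for a PTX wrapper
}

__global__ void force_dummy_st(void** fn_ptr)
{
#if __cccl_ptx_isa >= 810 // compile-time ISA gate
  NV_IF_TARGET(NV_PROVIDES_SM_90,
               (*fn_ptr++ = reinterpret_cast<void*>(
                  static_cast<void (*)(unsigned*, unsigned)>(dummy_st));));
#endif // __cccl_ptx_isa >= 810
}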
+ +__global__ void test_multimem_st(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // multimem.st.weak.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::multimem_st));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.st.relaxed.cta.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.cluster.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.gpu.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.sys.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.cta.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.cluster.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.gpu.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.sys.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // multimem.st.weak.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::multimem_st));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.st.relaxed.cta.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.cluster.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.gpu.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.sys.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.cta.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.cluster.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.gpu.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.sys.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_bulk.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_bulk.h new file mode 100644 index 00000000000..d9203b625e8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_bulk.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. 
+// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_st_bulk(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // st.bulk.weak.shared::cta [addr], size, initval; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::st_bulk));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_alloc.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_alloc.h new file mode 100644 index 00000000000..48a40f6f23c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_alloc.h @@ -0,0 +1,81 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_tcgen05_alloc(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [dst], nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_alloc)); + // tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [dst], nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_alloc));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [dst], nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_alloc)); + // tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [dst], nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_alloc));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.dealloc.cta_group::1.sync.aligned.b32 taddr, nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_dealloc)); + // tcgen05.dealloc.cta_group::2.sync.aligned.b32 taddr, nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_dealloc));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.dealloc.cta_group::1.sync.aligned.b32 taddr, nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_dealloc)); + // tcgen05.dealloc.cta_group::2.sync.aligned.b32 taddr, nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_dealloc));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_relinquish_alloc_permit)); + // tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_relinquish_alloc_permit));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_relinquish_alloc_permit)); + // tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_relinquish_alloc_permit));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_commit.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_commit.h new file mode 100644 index 00000000000..c41981e6917 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_commit.h @@ -0,0 +1,62 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_tcgen05_commit(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit)); + // tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit)); + // tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit_multicast)); + // tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], + // ctaMask; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_commit_multicast));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit_multicast)); + // tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], + // ctaMask; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_commit_multicast));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_cp.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_cp.h new file mode 100644 index 00000000000..4c37cb11cfa --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_cp.h @@ -0,0 +1,396 @@ +// This file was automatically generated. Do not edit. 
+ +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_tcgen05_cp(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x256b)); + // tcgen05.cp.cta_group::2.128x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x256b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x256b)); + // tcgen05.cp.cta_group::2.128x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.4x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_4x256b)); + // tcgen05.cp.cta_group::2.4x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_4x256b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.4x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_4x256b)); + // tcgen05.cp.cta_group::2.4x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_4x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x128b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x128b)); + // tcgen05.cp.cta_group::2.128x128b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x128b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x128b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x128b)); + // tcgen05.cp.cta_group::2.128x128b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + 
cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_32x128b_warpx4)); + // tcgen05.cp.cta_group::2.32x128b.warpx4 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_32x128b_warpx4)); + // tcgen05.cp.cta_group::2.32x128b.warpx4 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b6x16_p32)); + // 
tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + 
cuda::ptx::tcgen05_cp_128x256b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 
[taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_fence.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_fence.h new file mode 100644 index 00000000000..75b2ec35fa5 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_fence.h @@ -0,0 +1,44 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
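+
+// Editor's note (illustrative only): the force-instantiation pattern used by
+// every kernel in these generated headers can be reduced to the sketch below.
+// `example_wrapper` is a hypothetical overload set standing in for a cuda::ptx
+// wrapper; it is not part of the library. Casting to an exact function-pointer
+// type picks one overload, and storing the result through the externally
+// visible `fn_ptr` parameter prevents the NVVM layer from discarding its
+// definition as dead code.
+__device__ inline void example_wrapper(int* addr)
+{
+  *addr = 0;
+}
+__device__ inline void example_wrapper(long long* addr)
+{
+  *addr = 0;
+}
+
+__global__ void test_example_wrapper(void** fn_ptr)
+{
+  // One store per overload, mirroring the generated kernels in this file.
+  *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(int*)>(example_wrapper));
+  *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(long long*)>(example_wrapper));
+}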
+ +__global__ void test_tcgen05_fence(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.fence::before_thread_sync; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_fence_before_thread_sync));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.fence::before_thread_sync; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_fence_before_thread_sync));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.fence::after_thread_sync; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_fence_after_thread_sync));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.fence::after_thread_sync; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_fence_after_thread_sync));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_ld.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_ld.h new file mode 100644 index 00000000000..48ecce5869e --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_ld.h @@ -0,0 +1,1012 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
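+
+// Editor's note (illustrative only): each overload exercised below is guarded
+// twice. The preprocessor check on `__cccl_ptx_isa` gates on the PTX ISA
+// version the toolkit can emit, and NV_IF_TARGET gates on the SM architecture
+// currently being compiled for. A minimal, hypothetical consumer of that
+// pattern (the kernel name and body are placeholders, not library code):
+__global__ void example_guarded_kernel()
+{
+#if __cccl_ptx_isa >= 860
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_100a,
+               (
+                 // An SM_100a-only instruction sequence would go here.
+                 __threadfence();));
+#endif // __cccl_ptx_isa >= 860
+}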
+ +__global__ void test_tcgen05_ld(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; + * 
fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, 
[taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + 
NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + 
static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + 
NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma.h new file mode 100644 index 00000000000..7146c395fa7 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma.h @@ -0,0 +1,2928 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
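+
+// Editor's note (illustrative only): these kernels only need to be compiled,
+// not executed, for the PTX to be generated and inspected. If one did want to
+// launch such a kernel, a hypothetical host-side driver might look like the
+// commented sketch below (the buffer size is arbitrary and chosen purely for
+// illustration):
+//
+//   void** fn_ptr = nullptr;
+//   cudaMalloc(&fn_ptr, 1024 * sizeof(void*));
+//   test_tcgen05_mma<<<1, 1>>>(fn_ptr);
+//   cudaDeviceSynchronize();
+//   cudaFree(fn_ptr);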
+ +__global__ void test_tcgen05_mma(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], 
a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, 
enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, 
disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], [a_tmem], b_desc, 
idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], 
[scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if 
__cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], 
[scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], 
a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], 
[scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use));)); + 
NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_lastuse)); + // 
tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma_ws.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma_ws.h new file mode 100644 index 00000000000..7e1674f39fc --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma_ws.h @@ -0,0 +1,3570 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
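
The following is a minimal, stand-alone sketch of the strategy described in the comment above; it is not part of the generated file. The kernel takes a `void** fn_ptr` parameter that the compiler has to treat as observable, and each overload of interest has its address written through it, so the NVVM layer cannot dead-code-eliminate the wrapper. The wrapper `my_ptx_wrapper` and the kernel name `test_force_ptx_emission` are illustrative placeholders, and the `<cuda/ptx>` / `<nv/target>` includes are assumptions; only the `__cccl_ptx_isa` / `NV_IF_TARGET` guard pattern mirrors the hunks below.

#include <cuda/ptx>  // assumed to provide __cccl_ptx_isa and the cuda::ptx wrappers
#include <nv/target> // NV_IF_TARGET / NV_HAS_FEATURE_* target-dispatch macros

// Hypothetical stand-in for one cuda::ptx wrapper overload.
__device__ inline void my_ptx_wrapper(int) {}

__global__ void test_force_ptx_emission(void** fn_ptr)
{
#if __cccl_ptx_isa >= 860
  NV_IF_TARGET(NV_HAS_FEATURE_SM_100a,
               (
                 // static_cast picks one concrete overload; reinterpret_cast
                 // erases its type so the pointer can be stored through the
                 // externally visible kernel parameter, defeating DCE.
                 * fn_ptr++ = reinterpret_cast<void*>(
                   static_cast<void (*)(int)>(my_ptx_wrapper));));
#endif // __cccl_ptx_isa >= 860
}
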
+ +__global__ void test_tcgen05_mma_ws(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // 
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill));)); + 
NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // 
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, 
zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + 
// enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // 
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill));)); + NV_IF_TARGET( + 
NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + 
NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], 
a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // 
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, 
+ ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // 
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse 
[d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse 
[d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // 
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], [a_tmem], 
b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + 
static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, 
zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, 
zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse));)); 
+#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse 
[d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // 
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // 
enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_shift.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_shift.h new file mode 100644 index 00000000000..293d2787a87 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_shift.h @@ -0,0 +1,39 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
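The comment block above describes the pattern every one of these generated test files relies on. As a rough standalone illustration (not part of the patch), the sketch below shows the same forcing trick with a hypothetical wrapper `dummy_ptx_wrapper` standing in for a `cuda::ptx` overload; only the `fn_ptr` kernel parameter and the store-a-function-pointer pattern are taken from the generated code.

// Illustration only -- `dummy_ptx_wrapper` and `test_force_ptx_emission` are
// hypothetical names, not part of cuda::ptx or of this patch.
__device__ static inline void dummy_ptx_wrapper()
{
  // Stands in for a thin wrapper that emits a single PTX instruction.
  asm volatile("membar.cta;" : : : "memory");
}

__global__ void test_force_ptx_emission(void** fn_ptr)
{
  // The static_cast selects one overload; storing its address through the
  // externally visible kernel parameter keeps the function reachable, so the
  // NVVM optimizer cannot dead-code-eliminate it and must lower it to PTX.
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(dummy_ptx_wrapper));
}

In the generated files the same store is repeated once per overload and per instruction variant, which is why the bodies below are long but entirely mechanical.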
+
+__global__ void test_tcgen05_shift(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 860
+  NV_IF_TARGET(
+    NV_HAS_FEATURE_SM_100a,
+    (
+        // tcgen05.shift.cta_group::1.down [taddr];
+        * fn_ptr++ = reinterpret_cast(
+          static_cast(cuda::ptx::tcgen05_shift_down));
+        // tcgen05.shift.cta_group::2.down [taddr];
+        * fn_ptr++ = reinterpret_cast(
+          static_cast(cuda::ptx::tcgen05_shift_down));));
+  NV_IF_TARGET(
+    NV_HAS_FEATURE_SM_101a,
+    (
+        // tcgen05.shift.cta_group::1.down [taddr];
+        * fn_ptr++ = reinterpret_cast(
+          static_cast(cuda::ptx::tcgen05_shift_down));
+        // tcgen05.shift.cta_group::2.down [taddr];
+        * fn_ptr++ = reinterpret_cast(
+          static_cast(cuda::ptx::tcgen05_shift_down));));
+#endif // __cccl_ptx_isa >= 860
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_st.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_st.h
new file mode 100644
index 00000000000..ec8cb758e5d
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_st.h
@@ -0,0 +1,1012 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
+__global__ void test_tcgen05_st(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 860
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_100a,
+               (
+                   // tcgen05.st.sync.aligned.16x64b.x1.b32 [taddr], values;
+                   * fn_ptr++ = reinterpret_cast(
+                     static_cast(cuda::ptx::tcgen05_st_16x64b));));
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_101a,
+               (
+                   // tcgen05.st.sync.aligned.16x64b.x1.b32 [taddr], values;
+                   * fn_ptr++ = reinterpret_cast(
+                     static_cast(cuda::ptx::tcgen05_st_16x64b));));
+#endif // __cccl_ptx_isa >= 860
+
+#if __cccl_ptx_isa >= 860
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_100a,
+               (
+                   // tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [taddr], values;
+                   * fn_ptr++ = reinterpret_cast(
+                     static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));));
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_101a,
+               (
+                   // tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [taddr], values;
+                   * fn_ptr++ = reinterpret_cast(
+                     static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));));
+#endif // __cccl_ptx_isa >= 860
+
+#if __cccl_ptx_isa >= 860
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_100a,
+               (
+                   // tcgen05.st.sync.aligned.16x64b.x2.b32 [taddr], values;
+                   * fn_ptr++ = reinterpret_cast(
+                     static_cast(cuda::ptx::tcgen05_st_16x64b));));
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_101a,
+               (
+                   // tcgen05.st.sync.aligned.16x64b.x2.b32 [taddr], values;
+                   * fn_ptr++ = reinterpret_cast(
+                     static_cast(cuda::ptx::tcgen05_st_16x64b));));
+#endif // __cccl_ptx_isa >= 860
+
+#if __cccl_ptx_isa >= 860
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_100a,
+               (
+                   // tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [taddr], values;
+                   * fn_ptr++ = reinterpret_cast(
+                     static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));));
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_101a,
+               (
+                   // tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [taddr], values;
+                   * fn_ptr++ = reinterpret_cast(
+
static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x128.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x128.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + 
NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x4.b32 [taddr], values; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.st.sync.aligned.32x32b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x128.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x128.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); 
+#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x1.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[1])>(cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x1.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[1])>(cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[1])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[1])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x2.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[2])>(cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x2.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[2])>(cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[2])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[2])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x4.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[4])>(cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x4.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[4])>(cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[4])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[4])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x8.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const 
int32_t(&)[8])>(cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x8.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[8])>(cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[8])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[8])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x16.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[16])>( + cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x16.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[16])>( + cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[16])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[16])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x32.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[32])>( + cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x32.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[32])>( + cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[32])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[32])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x64.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[64])>( + cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x64.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const 
int32_t(&)[64])>( + cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[64])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[64])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x128.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[128])>( + cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x128.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[128])>( + cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[128])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[128])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_wait.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_wait.h new file mode 100644 index 00000000000..424d884049c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_wait.h @@ -0,0 +1,40 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
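Besides the function-pointer trick described above, every block in these generated tests sits behind two guards: a `#if __cccl_ptx_isa >= ...` preprocessor check (the wrapper is only declared when the toolkit supports that PTX ISA) and an `NV_IF_TARGET(NV_HAS_FEATURE_SM_...)` dispatch (the address is only taken when the current compilation pass targets a matching architecture). A minimal sketch of that combination, assuming a hypothetical `dummy_wrapper` in place of a real `cuda::ptx` overload and assuming `<cuda/ptx>` makes `__cccl_ptx_isa` visible:

#include <cuda/ptx>  // pulls in CCCL's configuration, including __cccl_ptx_isa
#include <nv/target> // NV_IF_TARGET / NV_HAS_FEATURE_SM_100a (recent CUDA toolkits)

__device__ static inline void dummy_wrapper() {} // placeholder for a cuda::ptx overload

__global__ void test_guarded_emission(void** fn_ptr)
{
#if __cccl_ptx_isa >= 860 // toolkit side: PTX ISA 8.6 must be available at compile time
  // target side: the store is only compiled in the pass that targets sm_100a
  NV_IF_TARGET(NV_HAS_FEATURE_SM_100a,
               ( *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(dummy_wrapper)); ));
#endif // __cccl_ptx_isa >= 860
}

This is why the same instruction comment appears twice per block in the generated files: once under NV_HAS_FEATURE_SM_100a and once under NV_HAS_FEATURE_SM_101a.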
+ +__global__ void test_tcgen05_wait(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.wait::ld.sync.aligned; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_wait_ld));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.wait::ld.sync.aligned; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_wait_ld));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.wait::st.sync.aligned; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_wait_st));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.wait::st.sync.aligned; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_wait_st));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h index 95446eb81fa..1439bc84bd0 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h @@ -20,7 +20,19 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; + // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast( cuda::ptx::tensormap_replace_global_address));)); #endif // __cccl_ptx_isa >= 830 @@ -29,7 +41,19 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; + // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast( cuda::ptx::tensormap_replace_global_address));)); #endif // __cccl_ptx_isa >= 830 @@ -38,7 +62,19 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast( 
static_cast(cuda::ptx::tensormap_replace_rank));)); #endif // __cccl_ptx_isa >= 830 @@ -47,7 +83,19 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::tensormap_replace_rank));)); #endif // __cccl_ptx_isa >= 830 @@ -56,7 +104,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_box_dim));)); @@ -66,7 +128,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_box_dim));)); @@ -76,7 +152,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_global_dim));)); @@ -86,7 +176,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 
[tm_addr], ord, new_val; + // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_global_dim));)); @@ -96,7 +200,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; + // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int64_t)>( cuda::ptx::tensormap_replace_global_stride));)); @@ -106,17 +224,93 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; + // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int64_t)>( cuda::ptx::tensormap_replace_global_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); + NV_IF_TARGET( + 
NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); #endif // __cccl_ptx_isa >= 830 #if __cccl_ptx_isa >= 830 NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_element_size));)); @@ -126,7 +320,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_element_size));)); @@ -136,7 +344,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_elemtype));)); @@ -146,7 +368,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + 
cuda::ptx::tensormap_replace_elemtype));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_elemtype));)); @@ -160,6 +396,20 @@ __global__ void test_tensormap_replace(void** fn_ptr) * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_interleave_layout));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.interleave_layout.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.interleave_layout.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); #endif // __cccl_ptx_isa >= 830 #if __cccl_ptx_isa >= 830 @@ -170,13 +420,41 @@ __global__ void test_tensormap_replace(void** fn_ptr) * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_interleave_layout));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); #endif // __cccl_ptx_isa >= 830 #if __cccl_ptx_isa >= 830 NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_swizzle_mode));)); @@ -186,7 +464,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_swizzle_mode));)); @@ -196,7 +488,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + 
cuda::ptx::tensormap_replace_fill_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_fill_mode));)); @@ -206,9 +512,57 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_fill_mode));)); #endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_atomicity));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_atomicity));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_atomicity));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_atomicity));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h b/libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h new file mode 100644 index 00000000000..fef34f25ef4 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// NVRTC ships a built-in copy of , so including CCCL's version of this header will omit the +// content since the header guards are already defined. To make older NVRTC versions have a few newer feature macros +// required for the PTX tests, we define them here outside the header guards. 
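(For context: a minimal sketch of the preprocessor mechanism the comment above relies on. All macro names below are placeholders invented for illustration; they are not the real NVRTC or CCCL identifiers.)

#define FICTITIOUS_TARGET_H    // (1) the compiler's built-in, older copy of the
                               //     header has already defined the include guard
#ifndef FICTITIOUS_TARGET_H    // (2) the newer copy of the same header is therefore
#  define FICTITIOUS_TARGET_H  //     skipped entirely, so the newer feature macro
#  define HAS_FEATURE_SM_NEW 1 //     it would define never appears
#endif
#ifndef HAS_FEATURE_SM_NEW     // (3) the workaround: a fallback definition placed
#  define HAS_FEATURE_SM_NEW 0 //     outside any include guard always takes effect
#endif
#if HAS_FEATURE_SM_NEW         // (4) downstream test code can now query the macro
// feature-specific test code would go here
#endif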
+// TODO(bgruber): limit this workaround to NVRTC versions older than the first one shipping those macros +#ifdef __CUDACC_RTC__ +# ifndef NV_HAS_FEATURE_SM_100a +# define NV_HAS_FEATURE_SM_100a __NV_HAS_FEATURE_SM_100a +# if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && defined(__CUDA_ARCH_FEAT_SM100_ALL)) +# define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_100a 1 +# else +# define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_100a 0 +# endif +# endif // NV_HAS_FEATURE_SM_100a + +// Re-enable sm_101a support in nvcc. +# ifndef NV_HAS_FEATURE_SM_101a +# define NV_HAS_FEATURE_SM_101a __NV_HAS_FEATURE_SM_101a +# if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1010) && defined(__CUDA_ARCH_FEAT_SM101_ALL)) +# define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_101a 1 +# else +# define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_101a 0 +# endif +# endif // NV_HAS_FEATURE_SM_101a +#endif // __CUDACC_RTC__ diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp index 33d08621ef4..003d8f97017 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/barrier_cluster.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp index e7ff21c2730..1bf931109ed 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_commit_group.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp index fdd35749cc6..be56b1b922c 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp index ae1546828ae..226dbe5cf47 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp @@ -16,6 +16,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_multicast.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp index eeb7b4bf5a5..42bc5b8e355 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp +++ 
b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_tensor.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp index d07351a2275..65172d72897 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp @@ -16,6 +16,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_tensor_multicast.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp index 87910d04941..b31a9fb6a81 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_wait_group.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp index 8b916d74bf9..76a9357ae2f 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_reduce_async_bulk.h" #ifdef _LIBCUDACXX_HAS_NVF16 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp index f6a6fd61735..289f3dd9411 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_reduce_async_bulk_tensor.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp index 56f54b345f7..c439720b8f8 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/fence.h" #include "generated/fence_mbarrier_init.h" #include "generated/fence_proxy_alias.h" diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp index 91a6dd94bf1..adf6bb3e769 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp +++ 
b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp @@ -15,6 +15,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/get_sreg.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp index ed39816b7d6..9935b0563d2 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/getctarank.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp index 93263910906..a0948e86b18 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/mbarrier_arrive.h" #include "generated/mbarrier_arrive_expect_tx.h" #include "generated/mbarrier_arrive_no_complete.h" diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp index 7af0db56b70..0583b4f6e29 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/mbarrier_init.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp index 896abb8a7d8..732db4f16a1 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp @@ -14,10 +14,12 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header +#include "generated/mbarrier_test_wait.h" +#include "generated/mbarrier_test_wait_parity.h" #include "generated/mbarrier_try_wait.h" #include "generated/mbarrier_try_wait_parity.h" -#include "generated/mbarrier_wait.h" -#include "generated/mbarrier_wait_parity.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp index c6f66503b1f..2993ba3893d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/red_async.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp index 7c008b77126..a833a3770f4 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp +++ 
b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/st_async.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp index bb5578fc730..5d8566be5b5 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/tensormap_cp_fenceproxy.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp index 264b7956fbb..f0c91aa2296 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/tensormap_replace.h" int main(int, char**) From 671ee2fc626f609bfae3f744ff4fb3d6b7109d47 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Wed, 29 Jan 2025 19:44:57 -0500 Subject: [PATCH 10/33] Update CI matrix to use NVKS nodes. (#3572) * Update CI matrix to use NVKS nodes. * Update windows CI scripts to accept -arch. * Move all non-Catch2 device algo tests to lid0/lid1. This makes sure that they run in the correct CI config on appropriate hardware. 
* Switch to all rtx queues: CUB -> RTXA6000 (48GiB) Thrust -> RTX4090 (24GiB) Others -> RTX2080 (8GiB) --- ci/matrix.yaml | 73 ++++++++++++++------------------- ci/windows/build_common.psm1 | 15 ++++++- ci/windows/build_cub.ps1 | 8 +++- ci/windows/build_cudax.ps1 | 8 +++- ci/windows/build_libcudacxx.ps1 | 8 +++- ci/windows/build_thrust.ps1 | 8 +++- ci/windows/test_thrust.ps1 | 8 +++- cub/test/CMakeLists.txt | 9 ++++ 8 files changed, 83 insertions(+), 54 deletions(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index c3f03d323ab..5ec715fb59b 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -19,49 +19,51 @@ workflows: - {jobs: ['build'], std: 'max', cxx: ['msvc2019']} - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']} # Current CTK testing: - - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc', 'clang']} + - {jobs: ['test'], project: ['thrust'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx4090'} + - {jobs: ['test'], project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx2080'} # Disabled until we figure out the issue with the TBB dll - #- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc']} + #- {jobs: ['test'], project: ['thrust'], std: 'max', cxx: ['msvc'], gpu: 'rtx4090'} + - {jobs: ['test'], project: ['libcudacxx'], std: 'max', cxx: ['msvc'], gpu: 'rtx2080'} # Split up cub tests: - - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc']} - - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']} - - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc']} - - {jobs: ['test_lid0'], project: ['cub'], std: 'max', cxx: 'gcc12', gpu: 'h100', sm: 'gpu' } + - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc'], gpu: 'rtxa6000'} + - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc'], gpu: 'rtxa6000'} + - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 'rtxa6000'} + - {jobs: ['test_lid0'], project: ['cub'], std: 'max', cxx: 'gcc12', gpu: 'h100', sm: 'gpu' } # Modded builds: - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'} - {jobs: ['build'], std: 'max', cxx: ['gcc', 'clang'], cpu: 'arm64'} - {jobs: ['build'], std: 'max', cxx: ['gcc'], sm: '90a'} # Test Thrust 32-bit-only dispatch here, since it's most likely to break. 64-bit-only is tested in nightly. 
- - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'} # default_projects: clang-cuda - {jobs: ['build'], std: 'all', cudacxx: 'clang', cxx: 'clang'} - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90'} - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90a'} # nvrtc: - - {jobs: ['nvrtc'], project: 'libcudacxx', std: 'all'} + - {jobs: ['nvrtc'], project: 'libcudacxx', std: 'all', gpu: 'rtx2080', sm: 'gpu'} # verify-codegen: - {jobs: ['verify_codegen'], project: 'libcudacxx'} # cudax has different CTK reqs: - - {jobs: ['build'], project: 'cudax', ctk: ['12.0'], std: 20, cxx: ['msvc14.36']} - - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc10', 'gcc11', 'gcc12']} - - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['clang14', 'clang15', 'clang16', 'clang17']} + - {jobs: ['build'], project: 'cudax', ctk: ['12.0'], std: 20, cxx: ['msvc14.36']} + - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc10', 'gcc11', 'gcc12']} + - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], project: 'cudax', ctk: ['12.5'], std: 'all', cxx: ['nvhpc']} - - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['msvc2022']} - - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 17, cxx: ['gcc'], sm: "90"} - - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc'], sm: "90a"} + - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['msvc2022']} + - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 17, cxx: ['gcc'], sm: "90"} + - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc'], sm: "90a"} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'} - - {jobs: ['test'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc12', 'clang', 'msvc']} + - {jobs: ['test'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc12', 'clang', 'msvc'], gpu: 'rtx2080'} # Python and c/parallel jobs: - - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6'} + - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6', gpu: 'rtx2080'} # cccl-infra: - - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14']} - - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']} + - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14'], gpu: 'rtx2080'} + - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang'], gpu: 'rtx2080'} nightly: # Edge-case jobs - - {jobs: ['limited'], project: 'cub', std: 17} - - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} - - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit'} + - {jobs: ['limited'], project: 'cub', std: 17, gpu: 'rtx2080'} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 'rtx4090'} # Old CTK/compiler - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc7', 'gcc8', 'gcc9', 'clang14', 'msvc2019']} - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc11'], sm: 
'60;70;80;90'} @@ -70,7 +72,11 @@ workflows: - {jobs: ['build'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], std: 'all', cxx: ['msvc2019']} # Test current CTK - - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']} + - {jobs: ['test'], project: 'cub', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtxa6000'} + - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'v100'} + - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'h100', sm: 'gpu' } + - {jobs: ['test'], project: 'thrust', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx4090'} + - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'} # Modded builds: - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'} - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'} @@ -88,26 +94,9 @@ workflows: - {jobs: ['build'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['gcc12'], sm: "90"} - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13'], sm: "90a"} - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13', 'clang16'], cpu: 'arm64'} - - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']} - - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']} - - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']} - -# # These are waiting on the NVKS nodes: -# - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc7', std: [11]} -# - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang14', std: [17]} -# - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17]} -# - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14]} -# - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all'} -# - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang14', std: [11]} -# # H100 runners are currently flakey, only build since those use CPU-only runners: -# - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]} -# - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang18', std: [17]} -# -# # nvrtc: -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']} -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']} -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all', project: ['libcudacxx']} -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc13', std: [11, 20], project: ['libcudacxx']} + - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12'] , gpu: 'rtx2080'} + - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14'], gpu: 'rtx2080'} + - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18'], gpu: 'rtx2080'} # Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows. 
exclude: diff --git a/ci/windows/build_common.psm1 b/ci/windows/build_common.psm1 index 1eb5f1a9d63..151bb1f112e 100644 --- a/ci/windows/build_common.psm1 +++ b/ci/windows/build_common.psm1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -20,6 +24,12 @@ if ($script:CL_VERSION_STRING -match "Version (\d+\.\d+)\.\d+") { Write-Host "Detected cl.exe version: $CL_VERSION" } +$script:GLOBAL_CMAKE_OPTIONS = "" +if ($CUDA_ARCH -ne 0) { + $script:GLOBAL_CMAKE_OPTIONS += "-DCMAKE_CUDA_ARCHITECTURES=$CUDA_ARCH" +} + + if (-not $env:CCCL_BUILD_INFIX) { $env:CCCL_BUILD_INFIX = "" } @@ -56,6 +66,7 @@ Write-Host "NVCC_VERSION=$NVCC_VERSION" Write-Host "CMAKE_BUILD_PARALLEL_LEVEL=$env:CMAKE_BUILD_PARALLEL_LEVEL" Write-Host "CTEST_PARALLEL_LEVEL=$env:CTEST_PARALLEL_LEVEL" Write-Host "CCCL_BUILD_INFIX=$env:CCCL_BUILD_INFIX" +Write-Host "GLOBAL_CMAKE_OPTIONS=$script:GLOBAL_CMAKE_OPTIONS" Write-Host "Current commit is:" Write-Host "$(git log -1 --format=short)" Write-Host "========================================" @@ -82,7 +93,7 @@ function configure_preset { pushd ".." # Echo and execute command to stdout: - $configure_command = "cmake --preset $PRESET $CMAKE_OPTIONS --log-level VERBOSE" + $configure_command = "cmake --preset $PRESET $script:GLOBAL_CMAKE_OPTIONS $CMAKE_OPTIONS --log-level VERBOSE" Write-Host $configure_command Invoke-Expression $configure_command $test_result = $LastExitCode diff --git a/ci/windows/build_cub.ps1 b/ci/windows/build_cub.ps1 index 32e4f71ee9a..27c5360ded9 100644 --- a/ci/windows/build_cub.ps1 +++ b/ci/windows/build_cub.ps1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { pushd "$PSScriptRoot/.." 
} -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "cub-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" diff --git a/ci/windows/build_cudax.ps1 b/ci/windows/build_cudax.ps1 index ca7bd578291..7b8cd0ff771 100644 --- a/ci/windows/build_cudax.ps1 +++ b/ci/windows/build_cudax.ps1 @@ -4,7 +4,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(20)] - [int]$CXX_STANDARD = 20 + [int]$CXX_STANDARD = 20, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $CURRENT_PATH = Split-Path $pwd -leaf @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { } Remove-Module -Name build_common -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "cudax-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" diff --git a/ci/windows/build_libcudacxx.ps1 b/ci/windows/build_libcudacxx.ps1 index a57e2280de7..2f80619f76b 100644 --- a/ci/windows/build_libcudacxx.ps1 +++ b/ci/windows/build_libcudacxx.ps1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { pushd "$PSScriptRoot/.." } -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $GPU_ARCHS +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "libcudacxx-cpp${CXX_STANDARD}" $CMAKE_OPTIONS = "" diff --git a/ci/windows/build_thrust.ps1 b/ci/windows/build_thrust.ps1 index 186ed94eace..bda86859fd4 100644 --- a/ci/windows/build_thrust.ps1 +++ b/ci/windows/build_thrust.ps1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { pushd "$PSScriptRoot/.." 
} -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "thrust-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" diff --git a/ci/windows/test_thrust.ps1 b/ci/windows/test_thrust.ps1 index 7c020714208..eabda06df5b 100644 --- a/ci/windows/test_thrust.ps1 +++ b/ci/windows/test_thrust.ps1 @@ -5,6 +5,10 @@ Param( [ValidateSet(11, 14, 17, 20)] [int]$CXX_STANDARD = 17, [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0, + [Parameter(Mandatory = $false)] [Alias("cpu-only")] [switch]$CPU_ONLY = $false ) @@ -24,11 +28,11 @@ If($CURRENT_PATH -ne "ci") { } # Execute the build script: -$build_command = "$PSScriptRoot/build_thrust.ps1 -std $CXX_STANDARD" +$build_command = "$PSScriptRoot/build_thrust.ps1 -std $CXX_STANDARD -arch $CUDA_ARCH" Write-Host "Executing: $build_command" Invoke-Expression $build_command -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module -Name "$PSScriptRoot/build_common.psm1" -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "thrust-cpu-cpp$CXX_STANDARD" diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index 5a093526edd..aaab1984e21 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -370,6 +370,15 @@ foreach (test_src IN LISTS test_srcs) set(launcher 0) endif() + # FIXME: There are a few remaining device algorithm tests that have not been ported to + # use Catch2 and lid variants. Mark these as `lid_0/1` so they'll run in the appropriate + # CI configs: + string(REGEX MATCH "^device_" is_device_test "${test_name}") + _cub_is_fail_test(is_fail_test "%{test_name}") + if (is_device_test AND NOT is_fail_test) + string(APPEND test_name ".lid_${launcher}") + endif() + # Only one version of this test. 
cub_add_test(test_target ${test_name} "${test_src}" ${cub_target} ${launcher}) cub_configure_cuda_target(${test_target} RDC ${CUB_FORCE_RDC}) From 0c17dbd005a934ffe2f83cf0b73a6a9aa5383852 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Wed, 29 Jan 2025 17:28:19 -0800 Subject: [PATCH 11/33] Deprecate and replace `CUB_IS_INT128_ENABLED` (#3427) Co-authored-by: Bernhard Manfred Gruber --- cub/cub/detail/fast_modulo_division.cuh | 6 +++--- .../device/dispatch/dispatch_histogram.cuh | 12 +++++------ .../tuning/tuning_run_length_encode.cuh | 8 ++++---- .../device/dispatch/tuning/tuning_scan.cuh | 4 ++-- .../dispatch/tuning/tuning_scan_by_key.cuh | 20 +++++++++---------- .../dispatch/tuning/tuning_select_if.cuh | 16 +++++++-------- cub/cub/util_ptx.cuh | 2 +- cub/cub/util_type.cuh | 13 ++---------- .../catch2_test_device_for_each_in_extents.cu | 4 ++-- cub/test/catch2_test_printing.cu | 2 +- cub/test/internal/catch2_test_fast_div_mod.cu | 2 +- cub/test/test_util.h | 2 +- 12 files changed, 41 insertions(+), 50 deletions(-) diff --git a/cub/cub/detail/fast_modulo_division.cuh b/cub/cub/detail/fast_modulo_division.cuh index 4a5f2048e32..09068d87be0 100644 --- a/cub/cub/detail/fast_modulo_division.cuh +++ b/cub/cub/detail/fast_modulo_division.cuh @@ -38,7 +38,7 @@ #endif // no system header #include // implicit_prom_t -#include // CUB_IS_INT128_ENABLED +#include // _CCCL_HAS_INT128() #include // cuda::std::ceil_div #include // std::has_single_bit @@ -79,7 +79,7 @@ struct larger_unsigned_type using type = ::cuda::std::uint64_t; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct larger_unsigned_type::type> @@ -87,7 +87,7 @@ struct larger_unsigned_type using type = __uint128_t; }; -#endif // CUB_IS_INT128_ENABLED +#endif // _CCCL_HAS_INT128() template using larger_unsigned_type_t = typename larger_unsigned_type::type; diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 2ac4e160220..2c2d0a2a9ca 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -646,27 +646,27 @@ public: using IntArithmeticT = ::cuda::std::_If< // sizeof(SampleT) + sizeof(CommonT) <= sizeof(uint32_t), // uint32_t, // -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() ::cuda::std::_If< // (::cuda::std::is_same::value || // ::cuda::std::is_same::value), // CommonT, // uint64_t> // -#else // ^^^ CUB_IS_INT128_ENABLED ^^^ / vvv !CUB_IS_INT128_ENABLED vvv +#else // ^^^ _CCCL_HAS_INT128() ^^^ / vvv !_CCCL_HAS_INT128() vvv uint64_t -#endif // !CUB_IS_INT128_ENABLED +#endif // !_CCCL_HAS_INT128() >; // Alias template that excludes __[u]int128 from the integral types template using is_integral_excl_int128 = -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() ::cuda::std::_If<::cuda::std::is_same::value&& ::cuda::std::is_same::value, ::cuda::std::false_type, ::cuda::std::is_integral>; -#else // ^^^ CUB_IS_INT128_ENABLED ^^^ / vvv !CUB_IS_INT128_ENABLED vvv +#else // ^^^ _CCCL_HAS_INT128() ^^^ / vvv !_CCCL_HAS_INT128() vvv ::cuda::std::is_integral; -#endif // !CUB_IS_INT128_ENABLED +#endif // !_CCCL_HAS_INT128() union ScaleT { diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index d938209dcf2..12f07f3f366 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -156,7 +156,7 @@ struct 
sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -216,7 +216,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { @@ -349,7 +349,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -414,7 +414,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 7b076507341..165a17cae52 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -175,7 +175,7 @@ struct sm80_tuning struct sm80_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> { @@ -221,7 +221,7 @@ template struct sm90_tuning struct sm90_tuning : sm90_tuning_vals {}; template <> struct sm90_tuning : sm90_tuning_vals {}; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm90_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : sm90_tuning_vals<__int128_t, 576, 21, 860, 630> {}; template <> struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh index f8e29201eea..2bc31ef6697 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh @@ -172,7 +172,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -229,7 +229,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -286,7 +286,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -343,7 +343,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -400,7 +400,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -465,7 +465,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { @@ -522,7 +522,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { @@ -579,7 +579,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { @@ -636,7 +636,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { @@ -693,7 +693,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index 10d22286068..c1b74b4ae09 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -121,7 +121,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm80_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -174,7 +174,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -227,7 +227,7 @@ 
struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm80_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { @@ -280,7 +280,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { @@ -336,7 +336,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm90_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -389,7 +389,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -442,7 +442,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm90_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { @@ -495,7 +495,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index 99beeed313e..e6bb45c4a31 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -99,7 +99,7 @@ BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type return (source >> bit_start) & MASK; } -# if CUB_IS_INT128_ENABLED +# if _CCCL_HAS_INT128() /** * Bitfield-extract for 128-bit types. */ diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index 4d1db99a821..a89cd159309 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -76,17 +76,8 @@ _CCCL_DIAG_POP CUB_NAMESPACE_BEGIN #ifndef CUB_IS_INT128_ENABLED -# if defined(__CUDACC_RTC__) -# if defined(__CUDACC_RTC_INT128__) -# define CUB_IS_INT128_ENABLED 1 -# endif // !defined(__CUDACC_RTC_INT128__) -# else // !defined(__CUDACC_RTC__) -# if _CCCL_CUDACC_AT_LEAST(11, 5) -# if _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC) -# define CUB_IS_INT128_ENABLED 1 -# endif // GCC || CLANG || NVHPC -# endif // _CCCL_CUDACC_AT_LEAST(11, 5) -# endif // !defined(__CUDACC_RTC__) +// Deprecated [Since 2.8] +# define CUB_IS_INT128_ENABLED _CCCL_HAS_INT128() #endif // !defined(CUB_IS_INT128_ENABLED) /****************************************************************************** diff --git a/cub/test/catch2_test_device_for_each_in_extents.cu b/cub/test/catch2_test_device_for_each_in_extents.cu index 3e5a6c6689a..313b9e58b38 100644 --- a/cub/test/catch2_test_device_for_each_in_extents.cu +++ b/cub/test/catch2_test_device_for_each_in_extents.cu @@ -107,7 +107,7 @@ using index_types = uint16_t, int32_t, uint32_t -# if CUB_IS_INT128_ENABLED +# if _CCCL_HAS_INT128() , int64_t, uint64_t @@ -120,7 +120,7 @@ using index_types_dynamic = uint16_t, int32_t, uint32_t -# if CUB_IS_INT128_ENABLED +# if _CCCL_HAS_INT128() , int64_t, uint64_t diff --git a/cub/test/catch2_test_printing.cu b/cub/test/catch2_test_printing.cu index 6f93515114a..63b622f3554 100644 --- a/cub/test/catch2_test_printing.cu +++ b/cub/test/catch2_test_printing.cu @@ -11,7 +11,7 @@ std::string print(T val) return ss.str(); } -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() TEST_CASE("Test utils can print __int128", "[test][utils]") { REQUIRE(print(__int128_t{0}) == "0"); diff --git 
a/cub/test/internal/catch2_test_fast_div_mod.cu b/cub/test/internal/catch2_test_fast_div_mod.cu index 8a1a3e96a27..ec3b5e20d68 100644 --- a/cub/test/internal/catch2_test_fast_div_mod.cu +++ b/cub/test/internal/catch2_test_fast_div_mod.cu @@ -42,7 +42,7 @@ using index_types = uint16_t, int32_t, uint32_t -# if CUB_IS_INT128_ENABLED +# if _CCCL_HAS_INT128() , int64_t, uint64_t diff --git a/cub/test/test_util.h b/cub/test/test_util.h index 031298120dc..9a5fefcc69c 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -717,7 +717,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair< return os; } -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() inline std::ostream& operator<<(std::ostream& os, __uint128_t val) { constexpr int max_digits = 40; From c02e845e7f40dc748777638ce70e9893560e473c Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 30 Jan 2025 07:39:35 +0100 Subject: [PATCH 12/33] Adds support for large num items to `DeviceMerge` (#3530) * adds support for large num items * re-enable vsmem tests * rephrases test description --- cub/cub/device/device_merge.cuh | 18 ++-- cub/test/catch2_test_device_merge.cu | 129 +++++---------------------- 2 files changed, 33 insertions(+), 114 deletions(-) diff --git a/cub/cub/device/device_merge.cuh b/cub/cub/device/device_merge.cuh index 7135546a0e6..814bad75248 100644 --- a/cub/cub/device/device_merge.cuh +++ b/cub/cub/device/device_merge.cuh @@ -76,16 +76,19 @@ struct DeviceMerge void* d_temp_storage, std::size_t& temp_storage_bytes, KeyIteratorIn1 keys_in1, - int num_keys1, + ::cuda::std::int64_t num_keys1, KeyIteratorIn2 keys_in2, - int num_keys2, + ::cuda::std::int64_t num_keys2, KeyIteratorOut keys_out, CompareOp compare_op = {}, cudaStream_t stream = nullptr) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys"); + + using offset_t = ::cuda::std::int64_t; + return detail::merge:: - dispatch_t:: + dispatch_t:: dispatch( d_temp_storage, temp_storage_bytes, @@ -161,16 +164,19 @@ struct DeviceMerge std::size_t& temp_storage_bytes, KeyIteratorIn1 keys_in1, ValueIteratorIn1 values_in1, - int num_pairs1, + ::cuda::std::int64_t num_pairs1, KeyIteratorIn2 keys_in2, ValueIteratorIn2 values_in2, - int num_pairs2, + ::cuda::std::int64_t num_pairs2, KeyIteratorOut keys_out, ValueIteratorOut values_out, CompareOp compare_op = {}, cudaStream_t stream = nullptr) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs"); + + using offset_t = ::cuda::std::int64_t; + return detail::merge::dispatch_t< KeyIteratorIn1, ValueIteratorIn1, @@ -178,7 +184,7 @@ struct DeviceMerge ValueIteratorIn2, KeyIteratorOut, ValueIteratorOut, - int, + offset_t, CompareOp>::dispatch(d_temp_storage, temp_storage_bytes, keys_in1, diff --git a/cub/test/catch2_test_device_merge.cu b/cub/test/catch2_test_device_merge.cu index ae0d3f84baa..4835f597710 100644 --- a/cub/test/catch2_test_device_merge.cu +++ b/cub/test/catch2_test_device_merge.cu @@ -20,103 +20,8 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergePairs, merge_pairs); DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergeKeys, merge_keys); -// TODO(bgruber): replace the following by the CUB device API directly, once we have figured out how to handle different -// offset types -namespace detail -{ -template > -CUB_RUNTIME_FUNCTION static cudaError_t merge_keys_custom_offset_type( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorIn1 keys_in1, - Offset num_keys1, - KeyIteratorIn2 
keys_in2, - Offset num_keys2, - KeyIteratorOut keys_out, - CompareOp compare_op = {}, - cudaStream_t stream = 0) -{ - CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys"); - return cub::detail::merge::dispatch_t< - KeyIteratorIn1, - cub::NullType*, - KeyIteratorIn2, - cub::NullType*, - KeyIteratorOut, - cub::NullType*, - Offset, - CompareOp>::dispatch(d_temp_storage, - temp_storage_bytes, - keys_in1, - nullptr, - num_keys1, - keys_in2, - nullptr, - num_keys2, - keys_out, - nullptr, - compare_op, - stream); -} - -template > -CUB_RUNTIME_FUNCTION static cudaError_t merge_pairs_custom_offset_type( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorIn1 keys_in1, - ValueIteratorIn1 values_in1, - Offset num_pairs1, - KeyIteratorIn2 keys_in2, - ValueIteratorIn2 values_in2, - Offset num_pairs2, - KeyIteratorOut keys_out, - ValueIteratorOut values_out, - CompareOp compare_op = {}, - cudaStream_t stream = 0) -{ - CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs"); - return cub::detail::merge::dispatch_t< - KeyIteratorIn1, - ValueIteratorIn1, - KeyIteratorIn2, - ValueIteratorIn2, - KeyIteratorOut, - ValueIteratorOut, - Offset, - CompareOp>::dispatch(d_temp_storage, - temp_storage_bytes, - keys_in1, - values_in1, - num_pairs1, - keys_in2, - values_in2, - num_pairs2, - keys_out, - values_out, - compare_op, - stream); -} -} // namespace detail - -DECLARE_LAUNCH_WRAPPER(detail::merge_keys_custom_offset_type, merge_keys_custom_offset_type); -DECLARE_LAUNCH_WRAPPER(detail::merge_pairs_custom_offset_type, merge_pairs_custom_offset_type); - using types = c2h::type_list; -// gevtushenko: there is no code path in CUB and Thrust that leads to unsigned offsets, so let's safe some compile time -using offset_types = c2h::type_list; - template , @@ -223,11 +128,27 @@ C2H_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_ }); } -C2H_TEST("DeviceMerge::MergeKeys offset types", "[merge][device]", offset_types) +C2H_TEST("DeviceMerge::MergeKeys works for large number of items", "[merge][device]") + +try +{ + using key_t = char; + using offset_t = int64_t; + + // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items + const auto num_items_int_max = static_cast(::cuda::std::numeric_limits::max()); + + // Generate the input sizes to test for + const offset_t num_items_lhs = + GENERATE_COPY(values({num_items_int_max + offset_t{1000000}, num_items_int_max - 1, offset_t{3}})); + const offset_t num_items_rhs = + GENERATE_COPY(values({num_items_int_max + offset_t{1000000}, num_items_int_max, offset_t{3}})); + + test_keys(num_items_lhs, num_items_rhs, ::cuda::std::less<>{}); +} +catch (const std::bad_alloc&) { - using key_t = int; - using offset_t = c2h::get<0, TestType>; - test_keys(3623, 6346, ::cuda::std::less<>{}, merge_keys_custom_offset_type); + // allocation failure is not a test failure, so we can run tests on smaller GPUs } C2H_TEST("DeviceMerge::MergeKeys input sizes", "[merge][device]") @@ -385,14 +306,6 @@ C2H_TEST("DeviceMerge::MergePairs value types", "[merge][device]", types) test_pairs(); } -C2H_TEST("DeviceMerge::MergePairs offset types", "[merge][device]", offset_types) -{ - using key_t = int; - using value_t = int; - using offset_t = c2h::get<0, TestType>; - test_pairs(3623, 6346, ::cuda::std::less<>{}, merge_pairs_custom_offset_type); -} - C2H_TEST("DeviceMerge::MergePairs input sizes", "[merge][device]") { using key_t = int; @@ -410,7 +323,7 @@ try using key_t = char; using value_t = 
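// A minimal usage sketch of the widened interface exercised by this test (hypothetical,
// pre-sorted device ranges d_keys1/d_keys2, output d_keys_out, 64-bit counts n1/n2;
// error checking omitted). Since the num_keys parameters are now taken as
// ::cuda::std::int64_t, inputs with more than INT_MAX elements can be merged directly:
//
//   void* d_temp_storage     = nullptr;
//   std::size_t temp_storage_bytes = 0;
//   cub::DeviceMerge::MergeKeys(
//     d_temp_storage, temp_storage_bytes, d_keys1, n1, d_keys2, n2, d_keys_out, ::cuda::std::less<>{});
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//   cub::DeviceMerge::MergeKeys(
//     d_temp_storage, temp_storage_bytes, d_keys1, n1, d_keys2, n2, d_keys_out, ::cuda::std::less<>{});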
char; const auto size = std::int64_t{1} << GENERATE(30, 31, 32, 33); - test_pairs(size, size, ::cuda::std::less<>{}, merge_pairs_custom_offset_type); + test_pairs(size, size, ::cuda::std::less<>{}); } catch (const std::bad_alloc&) { From a654bc6e0fec3937ddd597dc44adaec61a40701f Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 08:33:22 +0100 Subject: [PATCH 13/33] Support FP16 traits on CTK 12.0 (#3535) * Support FP16 traits on CTK 12.0 * Only enable constexpr limits when supported * Support float_eq on CTK < 12.2 --- .../is_extended_floating_point.h | 16 ++--- libcudacxx/include/cuda/std/limits | 58 ++++++++++++------- .../meta.unary.cat/is_floating_point.pass.cpp | 8 +-- .../limits/is_specialized.pass.cpp | 8 +-- .../limits/numeric.limits.members/common.h | 25 ++++++-- .../const_data_members.pass.cpp | 8 +-- .../denorm_min.pass.cpp | 8 +-- .../numeric.limits.members/digits.pass.cpp | 8 +-- .../numeric.limits.members/digits10.pass.cpp | 8 +-- .../numeric.limits.members/epsilon.pass.cpp | 8 +-- .../has_denorm.pass.cpp | 8 +-- .../has_denorm_loss.pass.cpp | 8 +-- .../has_infinity.pass.cpp | 8 +-- .../has_quiet_NaN.pass.cpp | 8 +-- .../has_signaling_NaN.pass.cpp | 8 +-- .../numeric.limits.members/infinity.pass.cpp | 16 ++--- .../is_bounded.pass.cpp | 8 +-- .../numeric.limits.members/is_exact.pass.cpp | 8 +-- .../numeric.limits.members/is_iec559.pass.cpp | 8 +-- .../is_integer.pass.cpp | 8 +-- .../numeric.limits.members/is_modulo.pass.cpp | 8 +-- .../numeric.limits.members/is_signed.pass.cpp | 8 +-- .../numeric.limits.members/lowest.pass.cpp | 8 +-- .../numeric.limits.members/max.pass.cpp | 8 +-- .../max_digits10.pass.cpp | 8 +-- .../max_exponent.pass.cpp | 8 +-- .../max_exponent10.pass.cpp | 8 +-- .../numeric.limits.members/min.pass.cpp | 8 +-- .../min_exponent.pass.cpp | 8 +-- .../min_exponent10.pass.cpp | 8 +-- .../numeric.limits.members/quiet_NaN.pass.cpp | 8 +-- .../numeric.limits.members/radix.pass.cpp | 8 +-- .../round_error.pass.cpp | 8 +-- .../round_style.pass.cpp | 8 +-- .../signaling_NaN.pass.cpp | 8 +-- .../tinyness_before.pass.cpp | 8 +-- .../numeric.limits.members/traps.pass.cpp | 8 +-- 37 files changed, 205 insertions(+), 174 deletions(-) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h b/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h index b9700a87066..040418f5fe7 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h @@ -22,16 +22,16 @@ #include -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) # include -#endif // _LIBCUDACXX_HAS_NVFP16 +#endif // _CCCL_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#if defined(_CCCL_HAS_NVBF16) _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") # include _CCCL_DIAG_POP -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() # include @@ -53,7 +53,7 @@ _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v # endif // !_CCCL_NO_INLINE_VARIABLES #endif // !_CCCL_NO_VARIABLE_TEMPLATES -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) template <> struct __is_extended_floating_point<__half> : true_type {}; @@ -62,9 +62,9 @@ struct __is_extended_floating_point<__half> : true_type template <> _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<__half> = true; # endif // !_CCCL_NO_INLINE_VARIABLES -#endif // _LIBCUDACXX_HAS_NVFP16 +#endif // _CCCL_HAS_NVFP16 -#if 
defined(_LIBCUDACXX_HAS_NVBF16) +#if defined(_CCCL_HAS_NVBF16) template <> struct __is_extended_floating_point<__nv_bfloat16> : true_type {}; @@ -73,7 +73,7 @@ struct __is_extended_floating_point<__nv_bfloat16> : true_type template <> _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<__nv_bfloat16> = true; # endif // !_CCCL_NO_INLINE_VARIABLES -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() template <> diff --git a/libcudacxx/include/cuda/std/limits b/libcudacxx/include/cuda/std/limits index 9d0cbc81108..29f4bf24ec3 100644 --- a/libcudacxx/include/cuda/std/limits +++ b/libcudacxx/include/cuda/std/limits @@ -608,7 +608,13 @@ public: #endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE }; -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) +# ifdef _LIBCUDACXX_HAS_NVFP16 +# define _LIBCUDACXX_FP16_CONSTEXPR constexpr +# else //_LIBCUDACXX_HAS_NVFP16 +# define _LIBCUDACXX_FP16_CONSTEXPR +# endif //_LIBCUDACXX_HAS_NVFP16 + template <> class __numeric_limits_impl<__half, __numeric_limits_type::__floating_point> { @@ -621,15 +627,15 @@ public: static constexpr int digits = 11; static constexpr int digits10 = 3; static constexpr int max_digits10 = 5; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type min() noexcept { return type(__half_raw{0x0400u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type max() noexcept { return type(__half_raw{0x7bffu}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type lowest() noexcept { return type(__half_raw{0xfbffu}); } @@ -637,11 +643,11 @@ public: static constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr int radix = __FLT_RADIX__; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type epsilon() noexcept { return type(__half_raw{0x1400u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type round_error() noexcept { return type(__half_raw{0x3800u}); } @@ -656,19 +662,19 @@ public: static constexpr bool has_signaling_NaN = true; static constexpr float_denorm_style has_denorm = denorm_present; static constexpr bool has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type infinity() noexcept { return type(__half_raw{0x7c00u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type quiet_NaN() noexcept { return type(__half_raw{0x7e00u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type signaling_NaN() noexcept { return type(__half_raw{0x7d00u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type denorm_min() noexcept { return type(__half_raw{0x0001u}); } @@ -681,9 +687,16 @@ public: static constexpr bool tinyness_before = false; static constexpr float_round_style round_style = round_to_nearest; }; -#endif // _LIBCUDACXX_HAS_NVFP16 +# undef _LIBCUDACXX_FP16_CONSTEXPR +#endif // 
_CCCL_HAS_NVFP16 + +#if defined(_CCCL_HAS_NVBF16) +# ifdef _LIBCUDACXX_HAS_NVBF16 +# define _LIBCUDACXX_BF16_CONSTEXPR constexpr +# else //_LIBCUDACXX_HAS_NVBF16 +# define _LIBCUDACXX_BF16_CONSTEXPR +# endif //_LIBCUDACXX_HAS_NVBF16 -#if defined(_LIBCUDACXX_HAS_NVBF16) template <> class __numeric_limits_impl<__nv_bfloat16, __numeric_limits_type::__floating_point> { @@ -696,15 +709,15 @@ public: static constexpr int digits = 8; static constexpr int digits10 = 2; static constexpr int max_digits10 = 4; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type min() noexcept { return type(__nv_bfloat16_raw{0x0080u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type max() noexcept { return type(__nv_bfloat16_raw{0x7f7fu}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type lowest() noexcept { return type(__nv_bfloat16_raw{0xff7fu}); } @@ -712,11 +725,11 @@ public: static constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr int radix = __FLT_RADIX__; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type epsilon() noexcept { return type(__nv_bfloat16_raw{0x3c00u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type round_error() noexcept { return type(__nv_bfloat16_raw{0x3f00u}); } @@ -731,19 +744,19 @@ public: static constexpr bool has_signaling_NaN = true; static constexpr float_denorm_style has_denorm = denorm_present; static constexpr bool has_denorm_loss = false; - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type infinity() noexcept { return type(__nv_bfloat16_raw{0x7f80u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type quiet_NaN() noexcept { return type(__nv_bfloat16_raw{0x7fc0u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type signaling_NaN() noexcept { return type(__nv_bfloat16_raw{0x7fa0u}); } - _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type denorm_min() noexcept { return type(__nv_bfloat16_raw{0x0001u}); } @@ -756,7 +769,8 @@ public: static constexpr bool tinyness_before = false; static constexpr float_round_style round_style = round_to_nearest; }; -#endif // _LIBCUDACXX_HAS_NVBF16 +# undef _LIBCUDACXX_BF16_CONSTEXPR +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() # if defined(_CCCL_BUILTIN_BIT_CAST) || _CCCL_STD_VER >= 2014 diff --git a/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp b/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp index b0b7a3f3b69..5a04070c598 100644 --- a/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp +++ b/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp @@ -80,12 +80,12 @@ int main(int, char**) test_is_floating_point(); 
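// Illustrative sketch of what the relaxed FP16 guards in this patch enable on CTK 12.0
// (hypothetical helper; assumes <cuda/std/limits> and the fp16 header are available).
// Per the _LIBCUDACXX_FP16_CONSTEXPR handling in <cuda/std/limits> above, these members
// are only constexpr when _LIBCUDACXX_HAS_NVFP16 is also defined:
//
//   #if defined(_CCCL_HAS_NVFP16)
//   __host__ __device__ inline __half half_epsilon()
//   {
//     return cuda::std::numeric_limits<__half>::epsilon();
//   }
//   #endif // _CCCL_HAS_NVFP16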
test_is_floating_point(); test_is_floating_point(); -#ifdef _LIBCUDACXX_HAS_NVFP16 +#ifdef _CCCL_HAS_NVFP16 test_is_floating_point<__half>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#ifdef _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVFP16 +#ifdef _CCCL_HAS_NVBF16 test_is_floating_point<__nv_bfloat16>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test_is_floating_point<__nv_fp8_e4m3>(); test_is_floating_point<__nv_fp8_e5m2>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp index 7113c0e2772..adb30091033 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp @@ -68,12 +68,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 static_assert(!cuda::std::numeric_limits>::is_specialized, "!cuda::std::numeric_limits >::is_specialized"); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h index 8400071611c..7d15f2ba6b6 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h @@ -17,6 +17,7 @@ #define __CUDA_NO_BFLOAT16_CONVERSIONS__ 1 #define __CUDA_NO_BFLOAT16_OPERATORS__ 1 +#include #include template @@ -42,27 +43,43 @@ __host__ __device__ inline __nv_fp8_e5m2 make_fp8_e5m2(double x, __nv_saturation __host__ __device__ inline bool float_eq(__nv_fp8_e4m3 x, __nv_fp8_e4m3 y) { +# if _CCCL_CUDACC_AT_LEAST(12, 2) return float_eq(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3)}, __half{__nv_cvt_fp8_to_halfraw(y.__x, __NV_E4M3)}); +# else + return ::cuda::std::bit_cast(x) == ::cuda::std::bit_cast(y); +# endif } __host__ __device__ inline bool float_eq(__nv_fp8_e5m2 x, __nv_fp8_e5m2 y) { +# if _CCCL_CUDACC_AT_LEAST(12, 2) return float_eq(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E5M2)}, __half{__nv_cvt_fp8_to_halfraw(y.__x, __NV_E5M2)}); +# else + return ::cuda::std::bit_cast(x) == ::cuda::std::bit_cast(y); +# endif } #endif // _CCCL_HAS_NVFP8 -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) __host__ __device__ inline bool float_eq(__half x, __half y) { +# if _CCCL_CUDACC_AT_LEAST(12, 2) return __heq(x, y); +# else + return __half2float(x) == __half2float(y); +# endif } -#endif // _LIBCUDACXX_HAS_NVFP16 +#endif // _CCCL_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#if defined(_CCCL_HAS_NVBF16) __host__ __device__ inline bool float_eq(__nv_bfloat16 x, __nv_bfloat16 y) { +# if _CCCL_CUDACC_AT_LEAST(12, 2) return __heq(x, y); +# else + return __bfloat162float(x) == __bfloat162float(y); +# endif } -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #endif // NUMERIC_LIMITS_MEMBERS_COMMON_H diff --git 
a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp index b095d63afcd..093b5d331be 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp @@ -110,12 +110,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test_type(); #endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test_type<__half>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test_type<__nv_bfloat16>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test_type<__nv_fp8_e4m3>(); test_type<__nv_fp8_e5m2>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp index 475f41a3388..9ea232eaad6 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp @@ -66,12 +66,12 @@ int main(int, char**) test(LDBL_TRUE_MIN); # endif #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(5.9604644775390625e-08)); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(9.18354961579912115600575419705e-41)); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(make_fp8_e4m3(0.001953125)); test<__nv_fp8_e5m2>(make_fp8_e5m2(0.0000152587890625)); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp index 0d3c910b672..01f6b05543b 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp @@ -55,12 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, 11>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, 8>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, 3>(); test<__nv_fp8_e5m2, 2>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp index bd66aeecfeb..24c53725738 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp +++ 
b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp @@ -74,12 +74,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(); test<__nv_fp8_e5m2>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp index 15366bdf308..bb65847df33 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp @@ -57,12 +57,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(LDBL_EPSILON); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(0.0009765625)); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(0.0078125)); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(make_fp8_e4m3(0.125)); test<__nv_fp8_e5m2>(make_fp8_e5m2(0.25)); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp index 8fa506b93ce..8d9881580bf 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, cuda::std::denorm_present>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, cuda::std::denorm_present>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, cuda::std::denorm_present>(); test<__nv_fp8_e5m2, cuda::std::denorm_present>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp index 3b7722acd8b..5a046a9b339 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, false>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // 
_CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, false>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, false>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp index ebddcb4421e..768e53d1c88 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, true>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, true>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, true>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp index 908f2d7fa4a..4c3e11a9b05 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, true>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, true>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, true>(); test<__nv_fp8_e5m2, true>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp index 62d81c8a524..1b80d1869e6 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, true>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, true>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, true>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp index 627105a4a8c..8dd611556c5 
100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp @@ -64,12 +64,12 @@ int main(int, char**) # ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(1. / 0.); # endif -# if defined(_LIBCUDACXX_HAS_NVFP16) +# if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(1.0 / 0.0)); -# endif // _LIBCUDACXX_HAS_NVFP16 -# if defined(_LIBCUDACXX_HAS_NVBF16) +# endif // _CCCL_HAS_NVFP16 +# if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(1.0 / 0.0)); -# endif // _LIBCUDACXX_HAS_NVBF16 +# endif // _CCCL_HAS_NVBF16 # if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(__nv_fp8_e4m3{}); test<__nv_fp8_e5m2>(make_fp8_e5m2(1.0 / 0.0)); @@ -81,12 +81,12 @@ int main(int, char**) # ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(INFINITY); # endif -# if defined(_LIBCUDACXX_HAS_NVFP16) +# if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(INFINITY)); -# endif // _LIBCUDACXX_HAS_NVFP16 -# if defined(_LIBCUDACXX_HAS_NVBF16) +# endif // _CCCL_HAS_NVFP16 +# if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(INFINITY)); -# endif // _LIBCUDACXX_HAS_NVBF16 +# endif // _CCCL_HAS_NVBF16 # if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(__nv_fp8_e4m3{}); test<__nv_fp8_e5m2>(make_fp8_e5m2(INFINITY)); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp index eeb9740e4e2..e28ab8313b6 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, true>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, true>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, true>(); test<__nv_fp8_e5m2, true>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp index c3c2e027c72..e6038f1589b 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, false>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, false>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, false>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp 
b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp index 7bab40e8826..1ff809bad09 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, true>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, true>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, false>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp index 68e7437f1e0..eed9d38c050 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, false>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, false>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, false>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp index 992be2b18b7..fc3ca9dbb4e 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, false>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, false>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, false>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp index be7e4f235a7..54005f6c0b9 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if 
defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, true>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, true>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, true>(); test<__nv_fp8_e5m2, true>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp index 6a8b2a9c181..72190bd2ad7 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp @@ -66,12 +66,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(-LDBL_MAX); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(-65504.0)); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(-3.3895313892515355e+38)); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(make_fp8_e4m3(-448.0)); test<__nv_fp8_e5m2>(make_fp8_e5m2(-57344.0)); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp index a1582e41b22..5039f773a2f 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp @@ -65,12 +65,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(LDBL_MAX); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(65504.0)); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(3.3895313892515355e+38)); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(make_fp8_e4m3(448.0)); test<__nv_fp8_e5m2>(make_fp8_e5m2(57344.0)); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp index d01a4aa099c..309279bc79c 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp @@ -69,12 +69,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(); test<__nv_fp8_e5m2>(); diff --git 
a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp index 3027e9f06f5..606e9c52b7f 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp @@ -62,12 +62,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, 16>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, 128>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, 8>(); test<__nv_fp8_e5m2, 15>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp index 5924aee173d..61145deec86 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp @@ -62,12 +62,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, 4>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, 38>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, 2>(); test<__nv_fp8_e5m2, 4>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp index 15f470909df..ccab08a38f5 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp @@ -66,12 +66,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(LDBL_MIN); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(6.103515625e-05)); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(1.17549435082228750796873653722e-38)); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(make_fp8_e4m3(0.015625)); test<__nv_fp8_e5m2>(make_fp8_e5m2(0.000061035)); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp index b63d653a7c3..c942a6288be 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp +++ 
b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp @@ -62,12 +62,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, -13>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, -125>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, -6>(); test<__nv_fp8_e5m2, -15>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp index a6ff20e7fde..e9b6f29d25f 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp @@ -62,12 +62,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, -4>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, -37>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, -2>(); test<__nv_fp8_e5m2, -5>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp index 2d6d9582f5c..a8b076fbeee 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp @@ -108,12 +108,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(); test<__nv_fp8_e5m2>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp index 7e5c87927aa..dd15c391180 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp @@ -55,12 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, FLT_RADIX>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, FLT_RADIX>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, 
FLT_RADIX>(); test<__nv_fp8_e5m2, FLT_RADIX>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp index d4faf373a09..95ed80eb951 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp @@ -57,12 +57,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(0.5); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(__double2half(0.5)); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(0.5)); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(make_fp8_e4m3(0.5)); test<__nv_fp8_e5m2>(make_fp8_e5m2(0.5)); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp index 8515581d650..1eb5c0b0f5a 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, cuda::std::round_to_nearest>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, cuda::std::round_to_nearest>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, cuda::std::round_to_nearest>(); test<__nv_fp8_e5m2, cuda::std::round_to_nearest>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp index 19ace1b3d2c..0ec70976b32 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp @@ -108,12 +108,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3>(); test<__nv_fp8_e5m2>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp index 38dec8c872b..1da28874b06 100644 --- 
a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp @@ -54,12 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, false>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, false>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, false>(); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp index 55d7eb990db..4cb627a4b77 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp @@ -60,12 +60,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif -#if defined(_LIBCUDACXX_HAS_NVFP16) +#if defined(_CCCL_HAS_NVFP16) test<__half, false>(); -#endif // _LIBCUDACXX_HAS_NVFP16 -#if defined(_LIBCUDACXX_HAS_NVBF16) +#endif // _CCCL_HAS_NVFP16 +#if defined(_CCCL_HAS_NVBF16) test<__nv_bfloat16, false>(); -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // _CCCL_HAS_NVBF16 #if _CCCL_HAS_NVFP8() test<__nv_fp8_e4m3, false>(); test<__nv_fp8_e5m2, false>(); From b6209e841a72eb7def4ba2aace30eff8a9b539a4 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 30 Jan 2025 09:06:31 +0100 Subject: [PATCH 14/33] Suppress execution checks for vocabulary types (#3578) * Suppress execution checks for optional * Suppress execution checks for `expected` * Suppress execution checks for `pair` * Suppress execution checks for `variant` --- .../cuda/std/__expected/bad_expected_access.h | 21 +- .../include/cuda/std/__expected/expected.h | 20 ++ .../cuda/std/__expected/expected_base.h | 18 ++ .../include/cuda/std/__expected/unexpected.h | 7 + .../include/cuda/std/__memory/construct_at.h | 1 + libcudacxx/include/cuda/std/__utility/pair.h | 19 +- .../cuda/std/detail/libcxx/include/optional | 25 +++ .../cuda/std/detail/libcxx/include/tuple | 2 + .../cuda/std/detail/libcxx/include/variant | 20 ++ .../expected/device_only_types.pass.cpp | 201 ++++++++++++++++++ .../expected/host_only_types.pass.cpp | 199 +++++++++++++++++ .../optional/device_only_types.pass.cpp | 136 ++++++++++++ .../optional/host_only_types.pass.cpp | 134 ++++++++++++ .../tuple/device_only_types.pass.cpp | 81 +++++++ .../tuple/forward_as_tuple_interop.pass.cpp | 0 .../utilities/tuple/host_only_types.pass.cpp | 90 ++++++++ .../tuple/vector_types_get.pass.cpp | 0 .../vector_types_structured_bindings.pass.cpp | 0 .../tuple/vector_types_tuple_element.pass.cpp | 0 .../tuple/vector_types_tuple_size.pass.cpp | 0 .../unexpected/device_only_types.pass.cpp | 82 +++++++ .../unexpected/host_only_types.pass.cpp | 85 ++++++++ .../utility/pair/device_only_types.pass.cpp | 93 ++++++++ .../utility/pair/host_only_types.pass.cpp | 93 ++++++++ .../pair/interop}/pair.assign.pass.cpp | 0 .../utility/pair/interop}/pair.cons.pass.cpp | 0 .../utility/pair/interop}/pair.conv.pass.cpp | 0 
.../variant/device_only_types.pass.cpp | 120 +++++++++++ .../variant/host_only_types.pass.cpp | 129 +++++++++++ libcudacxx/test/support/host_device_types.h | 148 +++++++++++++ 30 files changed, 1714 insertions(+), 10 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/expected/device_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/expected/host_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/optional/device_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/optional/host_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/tuple/device_only_types.pass.cpp rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/forward_as_tuple_interop.pass.cpp (100%) create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/tuple/host_only_types.pass.cpp rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/vector_types_get.pass.cpp (100%) rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/vector_types_structured_bindings.pass.cpp (100%) rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/vector_types_tuple_element.pass.cpp (100%) rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/vector_types_tuple_size.pass.cpp (100%) create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/unexpected/device_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/unexpected/host_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/device_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/host_only_types.pass.cpp rename libcudacxx/test/libcudacxx/cuda/{pair_interop => utilities/utility/pair/interop}/pair.assign.pass.cpp (100%) rename libcudacxx/test/libcudacxx/cuda/{pair_interop => utilities/utility/pair/interop}/pair.cons.pass.cpp (100%) rename libcudacxx/test/libcudacxx/cuda/{pair_interop => utilities/utility/pair/interop}/pair.conv.pass.cpp (100%) create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/variant/device_only_types.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/variant/host_only_types.pass.cpp create mode 100644 libcudacxx/test/support/host_device_types.h diff --git a/libcudacxx/include/cuda/std/__expected/bad_expected_access.h b/libcudacxx/include/cuda/std/__expected/bad_expected_access.h index 5600402e429..0f10f546be6 100644 --- a/libcudacxx/include/cuda/std/__expected/bad_expected_access.h +++ b/libcudacxx/include/cuda/std/__expected/bad_expected_access.h @@ -51,14 +51,6 @@ class bad_expected_access; template <> class bad_expected_access : public ::std::exception { -protected: - _CCCL_HIDE_FROM_ABI bad_expected_access() noexcept = default; - _CCCL_HIDE_FROM_ABI bad_expected_access(const bad_expected_access&) = default; - _CCCL_HIDE_FROM_ABI bad_expected_access(bad_expected_access&&) = default; - _CCCL_HIDE_FROM_ABI bad_expected_access& operator=(const bad_expected_access&) = default; - _CCCL_HIDE_FROM_ABI bad_expected_access& operator=(bad_expected_access&&) = default; - ~bad_expected_access() noexcept override = default; - public: // The way this has been designed (by using a class template below) means that we'll already // have a profusion of these vtables in TUs, and the dynamic linker will already have a bunch @@ -74,10 +66,21 @@ template class bad_expected_access : public bad_expected_access { public: - explicit bad_expected_access(_Err __e) +# if 
_CCCL_CUDA_COMPILER(CLANG) // Clang needs this or it breaks with device only types + _CCCL_HOST_DEVICE +# endif // _CCCL_CUDA_COMPILER(CLANG) + _CCCL_HIDE_FROM_ABI explicit bad_expected_access(_Err __e) : __unex_(_CUDA_VSTD::move(__e)) {} +# if _CCCL_CUDA_COMPILER(CLANG) // Clang needs this or it breaks with device only types + _CCCL_HOST_DEVICE +# endif // _CCCL_CUDA_COMPILER(CLANG) + _CCCL_HIDE_FROM_ABI ~bad_expected_access() noexcept + { + __unex_.~_Err(); + } + _LIBCUDACXX_HIDE_FROM_ABI _Err& error() & noexcept { return __unex_; diff --git a/libcudacxx/include/cuda/std/__expected/expected.h b/libcudacxx/include/cuda/std/__expected/expected.h index cc5ddfc03f0..f618ff57c92 100644 --- a/libcudacxx/include/cuda/std/__expected/expected.h +++ b/libcudacxx/include/cuda/std/__expected/expected.h @@ -1070,6 +1070,7 @@ class expected : private __expected_move_assign<_Tp, _Err> } // [expected.object.eq], equality operators + _CCCL_EXEC_CHECK_DISABLE friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const expected& __y) { if (__x.__has_val_ != __y.has_value()) @@ -1090,12 +1091,14 @@ class expected : private __expected_move_assign<_Tp, _Err> } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const expected& __y) { return !(__x == __y); } # endif // _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T2, class _E2) _CCCL_REQUIRES((!_CCCL_TRAIT(is_void, _T2))) friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const expected<_T2, _E2>& __y) @@ -1118,6 +1121,7 @@ class expected : private __expected_move_assign<_Tp, _Err> } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T2, class _E2) _CCCL_REQUIRES((!_CCCL_TRAIT(is_void, _T2))) friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const expected<_T2, _E2>& __y) @@ -1126,6 +1130,7 @@ class expected : private __expected_move_assign<_Tp, _Err> } # endif // _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T2) _CCCL_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) ) friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const _T2& __v) @@ -1133,18 +1138,21 @@ class expected : private __expected_move_assign<_Tp, _Err> return __x.__has_val_ && static_cast(__x.__union_.__val_ == __v); } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T2) _CCCL_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) ) friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const _T2& __v, const expected& __x) { return __x.__has_val_ && static_cast(__x.__union_.__val_ == __v); } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T2) _CCCL_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) ) friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const _T2& __v) { return !__x.__has_val_ || static_cast(__x.__union_.__val_ != __v); } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T2) _CCCL_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) ) friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const _T2& __v, const expected& __x) @@ -1153,22 +1161,26 @@ class expected : private __expected_move_assign<_Tp, _Err> } # endif // _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const unexpected<_E2>& __e) { return !__x.__has_val_ && static_cast(__x.__union_.__unex_ == __e.error()); } # 
if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const unexpected<_E2>& __e, const expected& __x) { return !__x.__has_val_ && static_cast(__x.__union_.__unex_ == __e.error()); } + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const unexpected<_E2>& __e) { return __x.__has_val_ || static_cast(__x.__union_.__unex_ != __e.error()); } + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const unexpected<_E2>& __e, const expected& __x) { @@ -1906,6 +1918,7 @@ class expected : private __expected_move_assign } // [expected.void.eq], equality operators + _CCCL_EXEC_CHECK_DISABLE friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const expected& __y) noexcept { if (__x.__has_val_ != __y.has_value()) @@ -1918,12 +1931,14 @@ class expected : private __expected_move_assign } } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const expected& __y) noexcept { return !(__x == __y); } # endif + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const expected& __y) noexcept @@ -1938,6 +1953,7 @@ class expected : private __expected_move_assign } } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const expected& __y) noexcept @@ -1946,22 +1962,26 @@ class expected : private __expected_move_assign } # endif + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const unexpected<_E2>& __y) noexcept { return !__x.__has_val_ && static_cast(__x.__union_.__unex_ == __y.error()); } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE template friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const unexpected<_E2>& __y, const expected& __x) noexcept { return !__x.__has_val_ && static_cast(__x.__union_.__unex_ == __y.error()); } + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI friend constexpr bool operator!=(const expected& __x, const unexpected<_E2>& __y) noexcept { return __x.__has_val_ || static_cast(__x.__union_.__unex_ != __y.error()); } + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI friend constexpr bool operator!=(const unexpected<_E2>& __y, const expected& __x) noexcept { diff --git a/libcudacxx/include/cuda/std/__expected/expected_base.h b/libcudacxx/include/cuda/std/__expected/expected_base.h index 31de97e3f50..0de6cc29158 100644 --- a/libcudacxx/include/cuda/std/__expected/expected_base.h +++ b/libcudacxx/include/cuda/std/__expected/expected_base.h @@ -71,30 +71,35 @@ union __expected_union_t struct __empty_t {}; + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Tp2 = _Tp) _CCCL_REQUIRES(_CCCL_TRAIT(is_default_constructible, _Tp2)) _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t() noexcept(_CCCL_TRAIT(is_nothrow_default_constructible, _Tp2)) : __val_() {} + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Tp2 = _Tp) _CCCL_REQUIRES((!_CCCL_TRAIT(is_default_constructible, _Tp2))) _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t() noexcept : __empty_() {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t(in_place_t, _Args&&... 
__args) noexcept( _CCCL_TRAIT(is_nothrow_constructible, _Tp, _Args...)) : __val_(_CUDA_VSTD::forward<_Args>(__args)...) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t(unexpect_t, _Args&&... __args) noexcept( _CCCL_TRAIT(is_nothrow_constructible, _Err, _Args...)) : __unex_(_CUDA_VSTD::forward<_Args>(__args)...) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t( __expected_construct_from_invoke_tag, @@ -104,6 +109,7 @@ union __expected_union_t : __val_(_CUDA_VSTD::invoke(_CUDA_VSTD::forward<_Fun>(__fun), _CUDA_VSTD::forward<_Args>(__args)...)) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t( __expected_construct_from_invoke_tag, @@ -128,18 +134,21 @@ union __expected_union_t<_Tp, _Err, true> struct __empty_t {}; + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Tp2 = _Tp) _CCCL_REQUIRES(_CCCL_TRAIT(is_default_constructible, _Tp2)) _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t() noexcept(_CCCL_TRAIT(is_nothrow_default_constructible, _Tp2)) : __val_() {} + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Tp2 = _Tp) _CCCL_REQUIRES((!_CCCL_TRAIT(is_default_constructible, _Tp2))) _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t() noexcept : __empty_() {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t(in_place_t, _Args&&... __args) noexcept( _CCCL_TRAIT(is_nothrow_constructible, _Tp, _Args...)) @@ -152,6 +161,7 @@ union __expected_union_t<_Tp, _Err, true> : __unex_(_CUDA_VSTD::forward<_Args>(__args)...) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t( __expected_construct_from_invoke_tag, @@ -161,6 +171,7 @@ union __expected_union_t<_Tp, _Err, true> : __val_(_CUDA_VSTD::invoke(_CUDA_VSTD::forward<_Fun>(__fun), _CUDA_VSTD::forward<_Args>(__args)...)) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t( __expected_construct_from_invoke_tag, @@ -436,6 +447,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err> { _LIBCUDACXX_DELEGATE_CONSTRUCTORS(__expected_storage, __expected_destruct, _Tp, _Err); + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T1, class _T2, class... _Args) _CCCL_REQUIRES(_CCCL_TRAIT(is_nothrow_constructible, _T1, _Args...)) static _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void @@ -445,6 +457,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err> _LIBCUDACXX_CONSTRUCT_AT(__newval, _CUDA_VSTD::forward<_Args>(__args)...); } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T1, class _T2, class... _Args) _CCCL_REQUIRES( (!_CCCL_TRAIT(is_nothrow_constructible, _T1, _Args...)) _CCCL_AND _CCCL_TRAIT(is_nothrow_move_constructible, _T1)) @@ -456,6 +469,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err> _LIBCUDACXX_CONSTRUCT_AT(__newval, _CUDA_VSTD::move(__tmp)); } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _T1, class _T2, class... 
_Args) _CCCL_REQUIRES( (!_CCCL_TRAIT(is_nothrow_constructible, _T1, _Args...)) _CCCL_AND(!_CCCL_TRAIT(is_nothrow_move_constructible, _T1))) @@ -475,6 +489,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err> __trans.__complete(); } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Err2 = _Err) _CCCL_REQUIRES(_CCCL_TRAIT(is_nothrow_move_constructible, _Err2)) static _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void @@ -493,6 +508,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err> __with_err.__has_val_ = true; } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Err2 = _Err) _CCCL_REQUIRES((!_CCCL_TRAIT(is_nothrow_move_constructible, _Err2))) static _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void @@ -653,6 +669,7 @@ struct __expected_copy_assign<_Tp, _Err, __smf_availability::__available> : __ex _CCCL_HIDE_FROM_ABI __expected_copy_assign(const __expected_copy_assign&) = default; _CCCL_HIDE_FROM_ABI __expected_copy_assign(__expected_copy_assign&&) = default; + _CCCL_EXEC_CHECK_DISABLE _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 __expected_copy_assign& operator=(const __expected_copy_assign& __other) noexcept( _CCCL_TRAIT(is_nothrow_copy_assignable, _Tp) && _CCCL_TRAIT(is_nothrow_copy_constructible, _Tp) @@ -917,6 +934,7 @@ struct __expected_storage : __expected_destruct { _LIBCUDACXX_DELEGATE_CONSTRUCTORS(__expected_storage, __expected_destruct, void, _Err); + _CCCL_EXEC_CHECK_DISABLE static _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void __swap_val_unex_impl( __expected_storage& __with_val, __expected_storage& __with_err) noexcept(_CCCL_TRAIT(is_nothrow_move_constructible, _Err)) diff --git a/libcudacxx/include/cuda/std/__expected/unexpected.h b/libcudacxx/include/cuda/std/__expected/unexpected.h index 0f8f3784374..0da94402a85 100644 --- a/libcudacxx/include/cuda/std/__expected/unexpected.h +++ b/libcudacxx/include/cuda/std/__expected/unexpected.h @@ -73,6 +73,7 @@ class unexpected _CCCL_HIDE_FROM_ABI unexpected(const unexpected&) = default; _CCCL_HIDE_FROM_ABI unexpected(unexpected&&) = default; + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Error = _Err) _CCCL_REQUIRES((!_CCCL_TRAIT(is_same, remove_cvref_t<_Error>, unexpected) && !_CCCL_TRAIT(is_same, remove_cvref_t<_Error>, in_place_t) @@ -82,6 +83,7 @@ class unexpected : __unex_(_CUDA_VSTD::forward<_Error>(__error)) {} + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class... _Args) _CCCL_REQUIRES(_CCCL_TRAIT(is_constructible, _Err, _Args...)) _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit unexpected(in_place_t, _Args&&... __args) noexcept( @@ -89,6 +91,7 @@ class unexpected : __unex_(_CUDA_VSTD::forward<_Args>(__args)...) {} + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Up, class... 
_Args) _CCCL_REQUIRES(_CCCL_TRAIT(is_constructible, _Err, initializer_list<_Up>&, _Args...)) _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit unexpected( @@ -123,6 +126,7 @@ class unexpected } // [expected.un.swap] + _CCCL_EXEC_CHECK_DISABLE _LIBCUDACXX_HIDE_FROM_ABI constexpr void swap(unexpected& __other) noexcept(_CCCL_TRAIT(is_nothrow_swappable, _Err)) { static_assert(_CCCL_TRAIT(is_swappable, _Err), "E must be swappable"); @@ -130,6 +134,7 @@ class unexpected swap(__unex_, __other.__unex_); } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Err2 = _Err) _CCCL_REQUIRES(_CCCL_TRAIT(is_swappable, _Err2)) friend _LIBCUDACXX_HIDE_FROM_ABI constexpr void @@ -140,6 +145,7 @@ class unexpected } // [expected.un.eq] + _CCCL_EXEC_CHECK_DISABLE template _CCCL_NODISCARD_FRIEND _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const unexpected& __lhs, @@ -148,6 +154,7 @@ class unexpected return __lhs.error() == __rhs.error(); } # if _CCCL_STD_VER < 2020 + _CCCL_EXEC_CHECK_DISABLE template _CCCL_NODISCARD_FRIEND _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const unexpected& __lhs, diff --git a/libcudacxx/include/cuda/std/__memory/construct_at.h b/libcudacxx/include/cuda/std/__memory/construct_at.h index bc231cd27d7..a78314c6479 100644 --- a/libcudacxx/include/cuda/std/__memory/construct_at.h +++ b/libcudacxx/include/cuda/std/__memory/construct_at.h @@ -50,6 +50,7 @@ # ifndef __cpp_lib_constexpr_dynamic_alloc namespace std { +_CCCL_EXEC_CHECK_DISABLE template ()) _Tp(_CUDA_VSTD::declval<_Args>()...))> diff --git a/libcudacxx/include/cuda/std/__utility/pair.h b/libcudacxx/include/cuda/std/__utility/pair.h index e725cf4b001..e8678f58767 100644 --- a/libcudacxx/include/cuda/std/__utility/pair.h +++ b/libcudacxx/include/cuda/std/__utility/pair.h @@ -124,6 +124,7 @@ struct __pair_base _T1 first; _T2 second; + _CCCL_EXEC_CHECK_DISABLE template , enable_if_t<_Constraints::__explicit_default_constructible, int> = 0> _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __pair_base() noexcept( @@ -132,6 +133,7 @@ struct __pair_base , second() {} + _CCCL_EXEC_CHECK_DISABLE template , enable_if_t<_Constraints::__implicit_default_constructible, int> = 0> _LIBCUDACXX_HIDE_FROM_ABI constexpr __pair_base() noexcept( @@ -140,6 +142,7 @@ struct __pair_base , second() {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __pair_base(_U1&& __t1, _U2&& __t2) noexcept( _CCCL_TRAIT(is_nothrow_constructible, _T1, _U1) && _CCCL_TRAIT(is_nothrow_constructible, _T2, _U2)) @@ -163,6 +166,7 @@ struct __pair_base<_T1, _T2, true> _T1 first; _T2 second; + _CCCL_EXEC_CHECK_DISABLE template , enable_if_t<_Constraints::__explicit_default_constructible, int> = 0> _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __pair_base() noexcept( @@ -171,6 +175,7 @@ struct __pair_base<_T1, _T2, true> , second() {} + _CCCL_EXEC_CHECK_DISABLE template , enable_if_t<_Constraints::__implicit_default_constructible, int> = 0> _LIBCUDACXX_HIDE_FROM_ABI constexpr __pair_base() noexcept( @@ -179,10 +184,13 @@ struct __pair_base<_T1, _T2, true> , second() {} + _CCCL_EXEC_CHECK_DISABLE _CCCL_HIDE_FROM_ABI constexpr __pair_base(const __pair_base&) = default; - _CCCL_HIDE_FROM_ABI constexpr __pair_base(__pair_base&&) = default; + _CCCL_EXEC_CHECK_DISABLE + _CCCL_HIDE_FROM_ABI constexpr __pair_base(__pair_base&&) = default; // We need to ensure that a reference type, which would inhibit the implicit copy assignment still works + _CCCL_EXEC_CHECK_DISABLE _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __pair_base& operator=( 
conditional_t<_CCCL_TRAIT(is_copy_assignable, _T1) && _CCCL_TRAIT(is_copy_assignable, _T2), __pair_base, __nat> const& __p) noexcept(_CCCL_TRAIT(is_nothrow_copy_assignable, _T1) && _CCCL_TRAIT(is_nothrow_copy_assignable, _T2)) @@ -193,6 +201,7 @@ struct __pair_base<_T1, _T2, true> } // We need to ensure that a reference type, which would inhibit the implicit move assignment still works + _CCCL_EXEC_CHECK_DISABLE _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __pair_base& operator=( conditional_t<_CCCL_TRAIT(is_move_assignable, _T1) && _CCCL_TRAIT(is_move_assignable, _T2), __pair_base, __nat>&& __p) noexcept(_CCCL_TRAIT(is_nothrow_move_assignable, _T1) && _CCCL_TRAIT(is_nothrow_move_assignable, _T2)) @@ -202,6 +211,7 @@ struct __pair_base<_T1, _T2, true> return *this; } + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __pair_base(_U1&& __t1, _U2&& __t2) noexcept( _CCCL_TRAIT(is_nothrow_constructible, _T1, _U1) && _CCCL_TRAIT(is_nothrow_constructible, _T2, _U2)) @@ -532,6 +542,7 @@ _CCCL_HOST_DEVICE pair(_T1, _T2) -> pair<_T1, _T2>; // [pairs.spec], specialized algorithms +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator==(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) { @@ -540,6 +551,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator==(const pair<_T1, #ifndef _LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr common_comparison_category_t<__synth_three_way_result<_T1>, __synth_three_way_result<_T2>> @@ -554,30 +566,35 @@ operator<=>(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) #else // _LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator!=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) { return !(__x == __y); } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator<(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) { return __x.first < __y.first || (!(__y.first < __x.first) && __x.second < __y.second); } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator>(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) { return __y < __x; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator>=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) { return !(__x < __y); } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator<=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) { diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/optional b/libcudacxx/include/cuda/std/detail/libcxx/include/optional index 04f056c91d3..d61ce254f4d 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/optional +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/optional @@ -296,12 +296,14 @@ struct __optional_destruct_base<_Tp, false> , __engaged_(false) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit __optional_destruct_base(in_place_t, _Args&&... __args) : __val_(_CUDA_VSTD::forward<_Args>(__args)...) , __engaged_(true) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __optional_destruct_base( __optional_construct_from_invoke_tag, _Fp&& __f, _Args&&... 
__args) @@ -338,12 +340,14 @@ struct __optional_destruct_base<_Tp, true> , __engaged_(false) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit __optional_destruct_base(in_place_t, _Args&&... __args) : __val_(_CUDA_VSTD::forward<_Args>(__args)...) , __engaged_(true) {} + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr __optional_destruct_base( __optional_construct_from_invoke_tag, _Fp&& __f, _Args&&... __args) @@ -389,6 +393,7 @@ struct __optional_storage_base : __optional_destruct_base<_Tp> return _CUDA_VSTD::move(this->__val_); } + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void __construct(_Args&&... __args) { @@ -410,6 +415,7 @@ struct __optional_storage_base : __optional_destruct_base<_Tp> } } + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr void __assign_from(_That&& __opt) { @@ -811,6 +817,7 @@ public: return this->__get(); } + _CCCL_EXEC_CHECK_DISABLE _LIBCUDACXX_HIDE_FROM_ABI constexpr void swap(optional& __opt) noexcept( _CCCL_TRAIT(is_nothrow_move_constructible, value_type) && _CCCL_TRAIT(is_nothrow_swappable, value_type)) { @@ -1088,6 +1095,7 @@ _CCCL_HOST_DEVICE optional(_Tp) -> optional<_Tp>; # endif // _CCCL_NO_DEDUCTION_GUIDES // Comparisons between optionals +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() == declval()), bool), @@ -1105,6 +1113,7 @@ operator==(const optional<_Tp>& __x, const optional<_Up>& __y) return *__x == *__y; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() != declval()), bool), @@ -1122,6 +1131,7 @@ operator!=(const optional<_Tp>& __x, const optional<_Up>& __y) return *__x != *__y; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() < declval()), bool), @@ -1139,6 +1149,7 @@ operator<(const optional<_Tp>& __x, const optional<_Up>& __y) return *__x < *__y; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() > declval()), bool), @@ -1156,6 +1167,7 @@ operator>(const optional<_Tp>& __x, const optional<_Up>& __y) return *__x > *__y; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() <= declval()), bool), @@ -1173,6 +1185,7 @@ operator<=(const optional<_Tp>& __x, const optional<_Up>& __y) return *__x <= *__y; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() >= declval()), bool), @@ -1264,6 +1277,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator>=(nullopt_t, const optional<_T } // Comparisons with T +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() == declval()), bool), @@ -1273,6 +1287,7 @@ operator==(const optional<_Tp>& __x, const _Up& __v) return static_cast(__x) ? *__x == __v : false; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() == declval()), bool), @@ -1282,6 +1297,7 @@ operator==(const _Tp& __v, const optional<_Up>& __x) return static_cast(__x) ? 
__v == *__x : false; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() != declval()), bool), @@ -1291,6 +1307,7 @@ operator!=(const optional<_Tp>& __x, const _Up& __v) return static_cast(__x) ? *__x != __v : true; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() != declval()), bool), @@ -1300,6 +1317,7 @@ operator!=(const _Tp& __v, const optional<_Up>& __x) return static_cast(__x) ? __v != *__x : true; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() < declval()), bool), @@ -1309,6 +1327,7 @@ operator<(const optional<_Tp>& __x, const _Up& __v) return static_cast(__x) ? *__x < __v : true; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() < declval()), bool), @@ -1318,6 +1337,7 @@ operator<(const _Tp& __v, const optional<_Up>& __x) return static_cast(__x) ? __v < *__x : false; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() <= declval()), bool), @@ -1327,6 +1347,7 @@ operator<=(const optional<_Tp>& __x, const _Up& __v) return static_cast(__x) ? *__x <= __v : true; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() <= declval()), bool), @@ -1336,6 +1357,7 @@ operator<=(const _Tp& __v, const optional<_Up>& __x) return static_cast(__x) ? __v <= *__x : false; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() > declval()), bool), @@ -1345,6 +1367,7 @@ operator>(const optional<_Tp>& __x, const _Up& __v) return static_cast(__x) ? *__x > __v : false; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() > declval()), bool), @@ -1354,6 +1377,7 @@ operator>(const _Tp& __v, const optional<_Up>& __x) return static_cast(__x) ? __v > *__x : true; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() >= declval()), bool), @@ -1363,6 +1387,7 @@ operator>=(const optional<_Tp>& __x, const _Up& __v) return static_cast(__x) ? *__x >= __v : false; } +_CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t< _CCCL_TRAIT(is_convertible, decltype(declval() >= declval()), bool), diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple index aa2fdeaa368..6ff1039e61b 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple @@ -1124,6 +1124,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 tuple<_Tp&&...> forward_as_tuple template struct __tuple_equal { + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator()(const _Tp& __x, const _Up& __y) { @@ -1157,6 +1158,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator!=(const tuple<_Tp. 
template struct __tuple_less { + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator()(const _Tp& __x, const _Up& __y) { diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/variant b/libcudacxx/include/cuda/std/detail/libcxx/include/variant index 0f6ec9d29fc..af1f7ba85ad 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/variant +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/variant @@ -255,6 +255,7 @@ C++20 #include #include #include +#include #include #include #include @@ -744,10 +745,22 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __alt { using __value_type = _Tp; + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __alt(in_place_t, _Args&&... __args) : __value(_CUDA_VSTD::forward<_Args>(__args)...) {} + _CCCL_EXEC_CHECK_DISABLE + constexpr __alt(const __alt&) = default; + _CCCL_EXEC_CHECK_DISABLE + constexpr __alt(__alt&&) = default; + _CCCL_EXEC_CHECK_DISABLE + constexpr __alt& operator=(const __alt&) = default; + _CCCL_EXEC_CHECK_DISABLE + constexpr __alt& operator=(__alt&&) = default; + + _CCCL_EXEC_CHECK_DISABLE + ~__alt() = default; __value_type __value; }; @@ -906,6 +919,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT __dtor<__traits<_Types...>, _Trait::_Availab { struct __visitor { + _CCCL_EXEC_CHECK_DISABLE template _LIBCUDACXX_HIDE_FROM_ABI void operator()(_Alt& __alt) const noexcept { @@ -1148,6 +1162,7 @@ public: } protected: + _CCCL_EXEC_CHECK_DISABLE template < size_t _Ip, class _Tp, @@ -1166,6 +1181,7 @@ protected: } } + _CCCL_EXEC_CHECK_DISABLE template < size_t _Ip, class _Tp, @@ -1896,7 +1912,11 @@ private: return __op(_CUDA_VSTD::get<0>(__lhs), _CUDA_VSTD::get<0>(__rhs)); } // We already checked that every variant has a value, so we should never reach this line +# if _CCCL_COMPILER(MSVC) // MSVC needs this to be wrapped in a function or it will error + _CUDA_VSTD::unreachable(); +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv _CCCL_UNREACHABLE(); +# endif // !_CCCL_COMPILER(MSVC) } }; diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/expected/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/expected/device_only_types.pass.cpp new file mode 100644 index 00000000000..ba972e02d3a --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/expected/device_only_types.pass.cpp @@ -0,0 +1,201 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// We cannot suppress execution checks in cuda::std::construct_at +// XFAIL: c++20 && !nvrtc && nvcc && !msvc +// UNSUPPORTED: clang-14 + +#include +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +__device__ void test() +{ + using expected = cuda::std::expected; + { // default construction + expected default_constructed{}; + assert(default_constructed.has_value()); + assert(*default_constructed == 0); + } + + { // in_place zero initialization + expected in_place_zero_initialization{cuda::std::in_place}; + assert(in_place_zero_initialization.has_value()); + assert(*in_place_zero_initialization == 0); + } + + { // in_place initialization + expected in_place_initialization{cuda::std::in_place, 42}; + assert(in_place_initialization.has_value()); + assert(*in_place_initialization == 42); + } + + { // initializer_list initialization + expected init_list_initialization{cuda::std::in_place, cuda::std::initializer_list{}, 42}; + assert(init_list_initialization.has_value()); + assert(*init_list_initialization == 42); + } + + { // unexpect zero initialization + expected in_place_zero_initialization{cuda::std::unexpect}; + assert(!in_place_zero_initialization.has_value()); + assert(in_place_zero_initialization.error() == 0); + } + + { // unexpect initialization + expected in_place_initialization{cuda::std::unexpect, 42}; + assert(!in_place_initialization.has_value()); + assert(in_place_initialization.error() == 42); + } + + { // initializer_list initialization + expected init_list_initialization{cuda::std::unexpect, cuda::std::initializer_list{}, 42}; + assert(!init_list_initialization.has_value()); + assert(init_list_initialization.error() == 42); + } + + { // value initialization + expected value_initialization{42}; + assert(value_initialization.has_value()); + assert(*value_initialization == 42); + } + + { // copy construction + expected input{42}; + expected dest{input}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // move construction + expected input{42}; + expected dest{cuda::std::move(input)}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to value + expected input{42}; + expected dest{1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to empty + expected input{42}; + expected dest{}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, empty to value + expected input{}; + expected dest{1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 0); + } + + { // assignment, empty to empty + expected input{}; + expected dest{}; + dest = input; + assert(dest.has_value()); + assert(*dest == 0); + } + + { // assignment, error to value + expected input{cuda::std::unexpect, 42}; + expected dest{1337}; + dest = input; + assert(!dest.has_value()); + assert(dest.error() == 42); + } + + { // assignment, value to error + expected input{42}; + expected dest{cuda::std::unexpect, 1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, error to error + expected input{cuda::std::unexpect, 42}; + expected dest{cuda::std::unexpect, 1337}; + dest = input; + assert(!dest.has_value()); + assert(dest.error() == 42); + } + + { // comparison with expected with value + expected lhs{42}; + expected rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + } + + { // comparison with expected with error + 
expected lhs{cuda::std::unexpect, 42}; + expected rhs{cuda::std::unexpect, 1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + } + + { // comparison with type and value + expected expect{42}; + assert(expect == device_only_type{42}); + assert(device_only_type{42} == expect); + assert(expect != device_only_type{1337}); + assert(device_only_type{1337} != expect); + } + + { // comparison with type and error + expected expect{cuda::std::unexpect, 42}; + assert(expect == cuda::std::unexpected{42}); + assert(cuda::std::unexpected{42} == expect); + assert(expect != cuda::std::unexpected{1337}); + assert(cuda::std::unexpected{1337} != expect); + } + + { // swap + expected lhs{42}; + expected rhs{1337}; + lhs.swap(rhs); + assert(*lhs == 1337); + assert(*rhs == 42); + + swap(lhs, rhs); + assert(*lhs == 42); + assert(*rhs == 1337); + } + + { // swap cross error + expected lhs{42}; + expected rhs{cuda::std::unexpect, 1337}; + lhs.swap(rhs); + assert(lhs.error() == 1337); + assert(*rhs == 42); + + swap(lhs, rhs); + assert(*lhs == 42); + assert(rhs.error() == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_DEVICE, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/expected/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/expected/host_only_types.pass.cpp new file mode 100644 index 00000000000..282288b7be8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/expected/host_only_types.pass.cpp @@ -0,0 +1,199 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc + +#include +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +void test() +{ + using expected = cuda::std::expected; + { // default construction + expected default_constructed{}; + assert(default_constructed.has_value()); + assert(*default_constructed == 0); + } + + { // in_place zero initialization + expected in_place_zero_initialization{cuda::std::in_place}; + assert(in_place_zero_initialization.has_value()); + assert(*in_place_zero_initialization == 0); + } + + { // in_place initialization + expected in_place_initialization{cuda::std::in_place, 42}; + assert(in_place_initialization.has_value()); + assert(*in_place_initialization == 42); + } + + { // initializer_list initialization + expected init_list_initialization{cuda::std::in_place, cuda::std::initializer_list{}, 42}; + assert(init_list_initialization.has_value()); + assert(*init_list_initialization == 42); + } + + { // unexpect zero initialization + expected in_place_zero_initialization{cuda::std::unexpect}; + assert(!in_place_zero_initialization.has_value()); + assert(in_place_zero_initialization.error() == 0); + } + + { // unexpect initialization + expected in_place_initialization{cuda::std::unexpect, 42}; + assert(!in_place_initialization.has_value()); + assert(in_place_initialization.error() == 42); + } + + { // initializer_list initialization + expected init_list_initialization{cuda::std::unexpect, cuda::std::initializer_list{}, 42}; + assert(!init_list_initialization.has_value()); + assert(init_list_initialization.error() == 42); + } + + { // value initialization + expected value_initialization{42}; + assert(value_initialization.has_value()); + assert(*value_initialization == 42); + } + + { // copy construction + expected input{42}; + expected dest{input}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // move construction + expected input{42}; + expected dest{cuda::std::move(input)}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to value + expected input{42}; + expected dest{1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to empty + expected input{42}; + expected dest{}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, empty to value + expected input{}; + expected dest{1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 0); + } + + { // assignment, empty to empty + expected input{}; + expected dest{}; + dest = input; + assert(dest.has_value()); + assert(*dest == 0); + } + + { // assignment, error to value + expected input{cuda::std::unexpect, 42}; + expected dest{1337}; + dest = input; + assert(!dest.has_value()); + assert(dest.error() == 42); + } + + { // assignment, value to error + expected input{42}; + expected dest{cuda::std::unexpect, 1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, error to error + expected input{cuda::std::unexpect, 42}; + expected dest{cuda::std::unexpect, 1337}; + dest = input; + assert(!dest.has_value()); + assert(dest.error() == 42); + } + + { // comparison with expected with value + expected lhs{42}; + expected rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + } + + { // comparison with expected with error + expected lhs{cuda::std::unexpect, 42}; + expected rhs{cuda::std::unexpect, 1337}; + assert(!(lhs == rhs)); + assert(lhs != 
rhs); + } + + { // comparison with type and value + expected expect{42}; + assert(expect == host_only_type{42}); + assert(host_only_type{42} == expect); + assert(expect != host_only_type{1337}); + assert(host_only_type{1337} != expect); + } + + { // comparison with type and error + expected expect{cuda::std::unexpect, 42}; + assert(expect == cuda::std::unexpected{42}); + assert(cuda::std::unexpected{42} == expect); + assert(expect != cuda::std::unexpected{1337}); + assert(cuda::std::unexpected{1337} != expect); + } + + { // swap + expected lhs{42}; + expected rhs{1337}; + lhs.swap(rhs); + assert(*lhs == 1337); + assert(*rhs == 42); + + swap(lhs, rhs); + assert(*lhs == 42); + assert(*rhs == 1337); + } + + { // swap cross error + expected lhs{42}; + expected rhs{cuda::std::unexpect, 1337}; + lhs.swap(rhs); + assert(lhs.error() == 1337); + assert(*rhs == 42); + + swap(lhs, rhs); + assert(*lhs == 42); + assert(rhs.error() == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_HOST, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/optional/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/optional/device_only_types.pass.cpp new file mode 100644 index 00000000000..766b6ae821c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/optional/device_only_types.pass.cpp @@ -0,0 +1,136 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// We cannot suppress execution checks in cuda::std::construct_at +// XFAIL: c++20 && !nvrtc && nvcc && !msvc +// UNSUPPORTED: clang-14 + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +__device__ void test() +{ + using optional = cuda::std::optional; + { // default construction + optional default_constructed{}; + assert(!default_constructed.has_value()); + } + + { // in_place zero initialization + optional in_place_zero_initialization{cuda::std::in_place}; + assert(in_place_zero_initialization.has_value()); + assert(*in_place_zero_initialization == 0); + } + + { // in_place initialization + optional in_place_initialization{cuda::std::in_place, 42}; + assert(in_place_initialization.has_value()); + assert(*in_place_initialization == 42); + } + + { // value initialization + optional value_initialization{42}; + assert(value_initialization.has_value()); + assert(*value_initialization == 42); + } + + { // copy construction + optional input{42}; + optional dest{input}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // move construction + optional input{42}; + optional dest{cuda::std::move(input)}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to value + optional input{42}; + optional dest{1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to empty + optional input{42}; + optional dest{}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, empty to value + optional input{}; + optional dest{1337}; + dest = input; + assert(!dest.has_value()); + } + + { // assignment, empty to empty + optional input{}; + optional 
dest{}; + dest = input; + assert(!dest.has_value()); + } + + { // comparison with optional + optional lhs{42}; + optional rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs > rhs)); + assert(!(lhs >= rhs)); + } + + { // comparison with type + optional opt{42}; + assert(opt == device_only_type{42}); + assert(device_only_type{42} == opt); + assert(opt != device_only_type{1337}); + assert(device_only_type{1337} != opt); + + assert(opt < device_only_type{1337}); + assert(device_only_type{7} < opt); + assert(opt <= device_only_type{1337}); + assert(device_only_type{7} <= opt); + + assert(opt > device_only_type{7}); + assert(device_only_type{1337} > opt); + assert(opt >= device_only_type{7}); + assert(device_only_type{1337} >= opt); + } + + { // swap + optional lhs{42}; + optional rhs{1337}; + lhs.swap(rhs); + assert(*lhs == 1337); + assert(*rhs == 42); + + swap(lhs, rhs); + assert(*lhs == 42); + assert(*rhs == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_DEVICE, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/optional/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/optional/host_only_types.pass.cpp new file mode 100644 index 00000000000..3bf26d0fb2e --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/optional/host_only_types.pass.cpp @@ -0,0 +1,134 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +void test() +{ + using optional = cuda::std::optional; + { // default construction + optional default_constructed{}; + assert(!default_constructed.has_value()); + } + + { // in_place zero initialization + optional in_place_zero_initialization{cuda::std::in_place}; + assert(in_place_zero_initialization.has_value()); + assert(*in_place_zero_initialization == 0); + } + + { // in_place initialization + optional in_place_initialization{cuda::std::in_place, 42}; + assert(in_place_initialization.has_value()); + assert(*in_place_initialization == 42); + } + + { // value initialization + optional value_initialization{42}; + assert(value_initialization.has_value()); + assert(*value_initialization == 42); + } + + { // copy construction + optional input{42}; + optional dest{input}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // move construction + optional input{42}; + optional dest{cuda::std::move(input)}; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to value + optional input{42}; + optional dest{1337}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, value to empty + optional input{42}; + optional dest{}; + dest = input; + assert(dest.has_value()); + assert(*dest == 42); + } + + { // assignment, empty to value + optional input{}; + optional dest{1337}; + dest = input; + assert(!dest.has_value()); + } + + { // assignment, empty to empty + optional input{}; + optional dest{}; + dest = input; + assert(!dest.has_value()); + } + + { // comparison with optional + optional lhs{42}; + optional rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs > rhs)); + assert(!(lhs >= rhs)); + } + + { // comparison with type + optional opt{42}; + assert(opt == host_only_type{42}); + assert(host_only_type{42} == opt); + assert(opt != host_only_type{1337}); + assert(host_only_type{1337} != opt); + + assert(opt < host_only_type{1337}); + assert(host_only_type{7} < opt); + assert(opt <= host_only_type{1337}); + assert(host_only_type{7} <= opt); + + assert(opt > host_only_type{7}); + assert(host_only_type{1337} > opt); + assert(opt >= host_only_type{7}); + assert(host_only_type{1337} >= opt); + } + + { // swap + optional lhs{42}; + optional rhs{1337}; + lhs.swap(rhs); + assert(*lhs == 1337); + assert(*rhs == 42); + + swap(lhs, rhs); + assert(*lhs == 42); + assert(*rhs == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_HOST, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/tuple/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/device_only_types.pass.cpp new file mode 100644 index 00000000000..d8820409d10 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/device_only_types.pass.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +__device__ void test() +{ + using tuple = cuda::std::tuple; + { // default construction + tuple default_constructed{}; + assert(cuda::std::get<0>(default_constructed) == 0); + } + + { // value initialization + tuple value_initialization{device_only_type{42}}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // value initialization + tuple value_initialization{42}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // copy construction + tuple input{42}; + tuple dest{input}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // move construction + tuple input{42}; + tuple dest{cuda::std::move(input)}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // assignment, value to value + tuple input{42}; + tuple dest{1337}; + dest = input; + assert(cuda::std::get<0>(dest) == 42); + } + + { // comparison with tuple + tuple lhs{42}; + tuple rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs > rhs)); + assert(!(lhs >= rhs)); + } + + { // swap + tuple lhs{42}; + tuple rhs{1337}; + lhs.swap(rhs); + assert(cuda::std::get<0>(lhs) == 1337); + assert(cuda::std::get<0>(rhs) == 42); + + swap(lhs, rhs); + assert(cuda::std::get<0>(lhs) == 42); + assert(cuda::std::get<0>(rhs) == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_DEVICE, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/forward_as_tuple_interop.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/forward_as_tuple_interop.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/tuple/forward_as_tuple_interop.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/forward_as_tuple_interop.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/tuple/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/host_only_types.pass.cpp new file mode 100644 index 00000000000..4942d051b1c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/host_only_types.pass.cpp @@ -0,0 +1,90 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +void test() +{ + using tuple = cuda::std::tuple; + { // default construction + tuple default_constructed{}; + assert(cuda::std::get<0>(default_constructed) == 0); + } + + { // value initialization + tuple value_initialization{host_only_type{42}}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // value initialization + tuple value_initialization{42}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // copy construction + tuple input{42}; + tuple dest{input}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // move construction + tuple input{42}; + tuple dest{cuda::std::move(input)}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // assignment, value to value + tuple input{42}; + tuple dest{1337}; + dest = input; + assert(cuda::std::get<0>(dest) == 42); + } + + { // assignment, value to empty + tuple input{42}; + tuple dest{}; + dest = input; + assert(cuda::std::get<0>(dest) == 42); + } + + { // comparison with tuple + tuple lhs{42}; + tuple rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs > rhs)); + assert(!(lhs >= rhs)); + } + + { // swap + tuple lhs{42}; + tuple rhs{1337}; + lhs.swap(rhs); + assert(cuda::std::get<0>(lhs) == 1337); + assert(cuda::std::get<0>(rhs) == 42); + + swap(lhs, rhs); + assert(cuda::std::get<0>(lhs) == 42); + assert(cuda::std::get<0>(rhs) == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_HOST, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/vector_types_get.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_get.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/tuple/vector_types_get.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_get.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/vector_types_structured_bindings.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_structured_bindings.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/tuple/vector_types_structured_bindings.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_structured_bindings.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/vector_types_tuple_element.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_tuple_element.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/tuple/vector_types_tuple_element.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_tuple_element.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/vector_types_tuple_size.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_tuple_size.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/tuple/vector_types_tuple_size.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_tuple_size.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/device_only_types.pass.cpp new file mode 100644 index 00000000000..f36e86c2c3f --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/device_only_types.pass.cpp @@ -0,0 +1,82 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +__device__ void test() +{ + using unexpected = cuda::std::unexpected; + { // in_place zero initialization + unexpected in_place_zero_initialization{cuda::std::in_place}; + assert(in_place_zero_initialization.error() == 0); + } + + { // in_place initialization + unexpected in_place_initialization{cuda::std::in_place, 42}; + assert(in_place_initialization.error() == 42); + } + + { // value initialization + unexpected value_initialization{42}; + assert(value_initialization.error() == 42); + } + + { // initializer_list initialization + unexpected init_list_initialization{cuda::std::in_place, cuda::std::initializer_list{}, 42}; + assert(init_list_initialization.error() == 42); + } + + { // copy construction + unexpected input{42}; + unexpected dest{input}; + assert(dest.error() == 42); + } + + { // move construction + unexpected input{42}; + unexpected dest{cuda::std::move(input)}; + assert(dest.error() == 42); + } + + { // assignment + unexpected input{42}; + unexpected dest{1337}; + dest = input; + assert(dest.error() == 42); + } + + { // comparison with unexpected + unexpected lhs{42}; + unexpected rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + } + + { // swap + unexpected lhs{42}; + unexpected rhs{1337}; + lhs.swap(rhs); + assert(lhs.error() == 1337); + assert(rhs.error() == 42); + + swap(lhs, rhs); + assert(lhs.error() == 42); + assert(rhs.error() == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_DEVICE, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/host_only_types.pass.cpp new file mode 100644 index 00000000000..ca12494418c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/host_only_types.pass.cpp @@ -0,0 +1,85 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc + +#include +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +void test() +{ + using unexpected = cuda::std::unexpected; + { // in_place zero initialization + unexpected in_place_zero_initialization{cuda::std::in_place}; + assert(in_place_zero_initialization.error() == 0); + } + + { // in_place initialization + unexpected in_place_initialization{cuda::std::in_place, 42}; + assert(in_place_initialization.error() == 42); + } + + { // value initialization + unexpected value_initialization{42}; + assert(value_initialization.error() == 42); + } + + { // initializer_list initialization + unexpected init_list_initialization{cuda::std::in_place, cuda::std::initializer_list{}, 42}; + assert(init_list_initialization.error() == 42); + } + + { // copy construction + unexpected input{42}; + unexpected dest{input}; + assert(dest.error() == 42); + } + + { // move construction + unexpected input{42}; + unexpected dest{cuda::std::move(input)}; + assert(dest.error() == 42); + } + + { // assignment + unexpected input{42}; + unexpected dest{1337}; + dest = input; + assert(dest.error() == 42); + } + + { // comparison with unexpected + unexpected lhs{42}; + unexpected rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + } + + { // swap + unexpected lhs{42}; + unexpected rhs{1337}; + lhs.swap(rhs); + assert(lhs.error() == 1337); + assert(rhs.error() == 42); + + swap(lhs, rhs); + assert(lhs.error() == 42); + assert(rhs.error() == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_HOST, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/device_only_types.pass.cpp new file mode 100644 index 00000000000..aebdd6e12ea --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/device_only_types.pass.cpp @@ -0,0 +1,93 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +__device__ void test() +{ + using pair = cuda::std::pair; + { // default construction + pair default_constructed{}; + assert(default_constructed.first == 0); + assert(default_constructed.second == 0); + } + + { // value initialization + pair value_initialization{device_only_type{42}, device_only_type{1337}}; + assert(value_initialization.first == 42); + assert(value_initialization.second == 1337); + } + + { // value initialization + pair value_initialization{42, 1337}; + assert(value_initialization.first == 42); + assert(value_initialization.second == 1337); + } + + { // copy construction + pair input{42, 1337}; + pair dest{input}; + assert(dest.first == 42); + assert(dest.second == 1337); + } + + { // move construction + pair input{42, 1337}; + pair dest{cuda::std::move(input)}; + assert(dest.first == 42); + assert(dest.second == 1337); + } + + { // assignment, value to value + pair input{42, 1337}; + pair dest{1337, 42}; + dest = input; + assert(dest.first == 42); + assert(dest.second == 1337); + } + + { // comparison with pair + pair lhs{42, 1337}; + pair rhs{1337, 42}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs > rhs)); + assert(!(lhs >= rhs)); + } + + { // swap + pair lhs{42, 1337}; + pair rhs{1337, 42}; + lhs.swap(rhs); + assert(lhs.first == 1337); + assert(lhs.second == 42); + assert(rhs.first == 42); + assert(rhs.second == 1337); + + swap(lhs, rhs); + assert(lhs.first == 42); + assert(lhs.second == 1337); + assert(rhs.first == 1337); + assert(rhs.second == 42); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_DEVICE, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/host_only_types.pass.cpp new file mode 100644 index 00000000000..cf1195f204d --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/host_only_types.pass.cpp @@ -0,0 +1,93 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +void test() +{ + using pair = cuda::std::pair; + { // default construction + pair default_constructed{}; + assert(default_constructed.first == 0); + assert(default_constructed.second == 0); + } + + { // value initialization + pair value_initialization{host_only_type{42}, host_only_type{1337}}; + assert(value_initialization.first == 42); + assert(value_initialization.second == 1337); + } + + { // value initialization + pair value_initialization{42, 1337}; + assert(value_initialization.first == 42); + assert(value_initialization.second == 1337); + } + + { // copy construction + pair input{42, 1337}; + pair dest{input}; + assert(dest.first == 42); + assert(dest.second == 1337); + } + + { // move construction + pair input{42, 1337}; + pair dest{cuda::std::move(input)}; + assert(dest.first == 42); + assert(dest.second == 1337); + } + + { // assignment, value to value + pair input{42, 1337}; + pair dest{1337, 42}; + dest = input; + assert(dest.first == 42); + assert(dest.second == 1337); + } + + { // comparison with pair + pair lhs{42, 1337}; + pair rhs{1337, 42}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs > rhs)); + assert(!(lhs >= rhs)); + } + + { // swap + pair lhs{42, 1337}; + pair rhs{1337, 42}; + lhs.swap(rhs); + assert(lhs.first == 1337); + assert(lhs.second == 42); + assert(rhs.first == 42); + assert(rhs.second == 1337); + + swap(lhs, rhs); + assert(lhs.first == 42); + assert(lhs.second == 1337); + assert(rhs.first == 1337); + assert(rhs.second == 42); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_HOST, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/pair_interop/pair.assign.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.assign.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/pair_interop/pair.assign.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.assign.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/pair_interop/pair.cons.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.cons.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/pair_interop/pair.cons.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.cons.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/pair_interop/pair.conv.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.conv.pass.cpp similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/pair_interop/pair.conv.pass.cpp rename to libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.conv.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/variant/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/variant/device_only_types.pass.cpp new file mode 100644 index 00000000000..38ee416a8fc --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/variant/device_only_types.pass.cpp @@ -0,0 +1,120 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +__device__ void test() +{ + using variant = cuda::std::variant; + { // default construction + variant default_constructed{}; + assert(cuda::std::get<0>(default_constructed) == 0); + } + + { // value initialization + variant value_initialization{device_only_type{42}}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // value initialization + variant value_initialization{42}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // in_place_type_t initialization + variant in_place_initialization{cuda::std::in_place_type_t{}, 42}; + assert(cuda::std::get<0>(in_place_initialization) == 42); + } + + { // in_place_index_t initialization + variant in_place_initialization{cuda::std::in_place_index_t<0>{}, 42}; + assert(cuda::std::get<0>(in_place_initialization) == 42); + } + + { // in_place_type_t initializer_list initialization + variant init_list_initialization{ + cuda::std::in_place_type_t{}, cuda::std::initializer_list{}, 42}; + assert(cuda::std::get<0>(init_list_initialization) == 42); + } + + { // in_place_type_t initializer_list initialization + variant init_list_initialization{cuda::std::in_place_index_t<0>{}, cuda::std::initializer_list{}, 42}; + assert(cuda::std::get<0>(init_list_initialization) == 42); + } + + { // copy construction + variant input{42}; + variant dest{input}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // move construction + variant input{42}; + variant dest{cuda::std::move(input)}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // assignment, value to value + variant input{42}; + variant dest{1337}; + dest = input; + assert(cuda::std::get<0>(dest) == 42); + } + + { // emplace + variant var{42}; + var.emplace(42); + assert(cuda::std::get<0>(var) == 42); + } + + { // emplace + variant var{42}; + var.emplace<0>(42); + assert(cuda::std::get<0>(var) == 42); + } + + { // emplace init list + variant var{42}; + var.emplace(cuda::std::initializer_list{}, 42); + assert(cuda::std::get<0>(var) == 42); + } + + { // comparison with variant + variant lhs{42}; + variant rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs > rhs)); + assert(!(lhs >= rhs)); + } + + { // swap + variant lhs{42}; + variant rhs{1337}; + lhs.swap(rhs); + assert(cuda::std::get<0>(lhs) == 1337); + assert(cuda::std::get<0>(rhs) == 42); + + swap(lhs, rhs); + assert(cuda::std::get<0>(lhs) == 42); + assert(cuda::std::get<0>(rhs) == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_DEVICE, (test();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/variant/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/variant/host_only_types.pass.cpp new file mode 100644 index 00000000000..5f12da6074b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/utilities/variant/host_only_types.pass.cpp @@ -0,0 +1,129 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: nvrtc + +#include +#include + +#include "host_device_types.h" +#include "test_macros.h" + +void test() +{ + using variant = cuda::std::variant; + { // default construction + variant default_constructed{}; + assert(cuda::std::get<0>(default_constructed) == 0); + } + + { // value initialization + variant value_initialization{host_only_type{42}}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // value initialization + variant value_initialization{42}; + assert(cuda::std::get<0>(value_initialization) == 42); + } + + { // in_place_type_t initialization + variant in_place_initialization{cuda::std::in_place_type_t{}, 42}; + assert(cuda::std::get<0>(in_place_initialization) == 42); + } + + { // in_place_index_t initialization + variant in_place_initialization{cuda::std::in_place_index_t<0>{}, 42}; + assert(cuda::std::get<0>(in_place_initialization) == 42); + } + + { // in_place_type_t initializer_list initialization + variant init_list_initialization{ + cuda::std::in_place_type_t{}, cuda::std::initializer_list{}, 42}; + assert(cuda::std::get<0>(init_list_initialization) == 42); + } + + { // in_place_type_t initializer_list initialization + variant init_list_initialization{cuda::std::in_place_index_t<0>{}, cuda::std::initializer_list{}, 42}; + assert(cuda::std::get<0>(init_list_initialization) == 42); + } + + { // copy construction + variant input{42}; + variant dest{input}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // move construction + variant input{42}; + variant dest{cuda::std::move(input)}; + assert(cuda::std::get<0>(dest) == 42); + } + + { // assignment, value to value + variant input{42}; + variant dest{1337}; + dest = input; + assert(cuda::std::get<0>(dest) == 42); + } + + { // assignment, value to empty + variant input{42}; + variant dest{}; + dest = input; + assert(cuda::std::get<0>(dest) == 42); + } + + { // emplace + variant var{42}; + var.emplace(42); + assert(cuda::std::get<0>(var) == 42); + } + + { // emplace + variant var{42}; + var.emplace<0>(42); + assert(cuda::std::get<0>(var) == 42); + } + + { // emplace init list + variant var{42}; + var.emplace(cuda::std::initializer_list{}, 42); + assert(cuda::std::get<0>(var) == 42); + } + + { // comparison with variant + variant lhs{42}; + variant rhs{1337}; + assert(!(lhs == rhs)); + assert(lhs != rhs); + assert(lhs < rhs); + assert(lhs <= rhs); + assert(!(lhs > rhs)); + assert(!(lhs >= rhs)); + } + + { // swap + variant lhs{42}; + variant rhs{1337}; + lhs.swap(rhs); + assert(cuda::std::get<0>(lhs) == 1337); + assert(cuda::std::get<0>(rhs) == 42); + + swap(lhs, rhs); + assert(cuda::std::get<0>(lhs) == 42); + assert(cuda::std::get<0>(rhs) == 1337); + } +} + +int main(int arg, char** argv) +{ + NV_IF_TARGET(NV_IS_HOST, (test();)) + return 0; +} diff --git a/libcudacxx/test/support/host_device_types.h b/libcudacxx/test/support/host_device_types.h new file mode 100644 index 00000000000..e8fa21b85b9 --- /dev/null +++ b/libcudacxx/test/support/host_device_types.h @@ -0,0 +1,148 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef TEST_SUPPORT_HOST_DEVICE_TYPES +#define TEST_SUPPORT_HOST_DEVICE_TYPES + +#include +#include + +#if !_CCCL_COMPILER(NVRTC) +struct host_only_type +{ + int val_; + + host_only_type(const int val = 0) noexcept + : val_(val) + {} + host_only_type(cuda::std::initializer_list, const int val) noexcept + : val_(val) + {} + + host_only_type(const host_only_type& other) noexcept + : val_(other.val_) + {} + host_only_type(host_only_type&& other) noexcept + : val_(cuda::std::exchange(other.val_, -1)) + {} + + host_only_type& operator=(const host_only_type& other) noexcept + { + val_ = other.val_; + return *this; + } + + host_only_type& operator=(host_only_type&& other) noexcept + + { + val_ = cuda::std::exchange(other.val_, -1); + return *this; + } + + ~host_only_type() noexcept {} + + _CCCL_NODISCARD_FRIEND bool operator==(const host_only_type& lhs, const host_only_type& rhs) noexcept + { + return lhs.val_ == rhs.val_; + } + _CCCL_NODISCARD_FRIEND bool operator!=(const host_only_type& lhs, const host_only_type& rhs) noexcept + { + return lhs.val_ != rhs.val_; + } + _CCCL_NODISCARD_FRIEND bool operator<(const host_only_type& lhs, const host_only_type& rhs) noexcept + { + return lhs.val_ < rhs.val_; + } + _CCCL_NODISCARD_FRIEND bool operator<=(const host_only_type& lhs, const host_only_type& rhs) noexcept + { + return lhs.val_ <= rhs.val_; + } + _CCCL_NODISCARD_FRIEND bool operator>(const host_only_type& lhs, const host_only_type& rhs) noexcept + { + return lhs.val_ > rhs.val_; + } + _CCCL_NODISCARD_FRIEND bool operator>=(const host_only_type& lhs, const host_only_type& rhs) noexcept + { + return lhs.val_ >= rhs.val_; + } + + void swap(host_only_type& other) noexcept + { + cuda::std::swap(val_, other.val_); + } +}; +#endif // !_CCCL_COMPILER(NVRTC) + +#if _CCCL_HAS_CUDA_COMPILER +struct device_only_type +{ + int val_; + + __device__ device_only_type(const int val = 0) noexcept + : val_(val) + {} + __device__ device_only_type(cuda::std::initializer_list, const int val) noexcept + : val_(val) + {} + + __device__ device_only_type(const device_only_type& other) noexcept + : val_(other.val_) + {} + __device__ device_only_type(device_only_type&& other) noexcept + : val_(cuda::std::exchange(other.val_, -1)) + {} + + __device__ device_only_type& operator=(const device_only_type& other) noexcept + { + val_ = other.val_; + return *this; + } + + __device__ device_only_type& operator=(device_only_type&& other) noexcept + + { + val_ = cuda::std::exchange(other.val_, -1); + return *this; + } + + __device__ ~device_only_type() noexcept {} + + __device__ _CCCL_NODISCARD_FRIEND bool operator==(const device_only_type& lhs, const device_only_type& rhs) noexcept + { + return lhs.val_ == rhs.val_; + } + __device__ _CCCL_NODISCARD_FRIEND bool operator!=(const device_only_type& lhs, const device_only_type& rhs) noexcept + { + return lhs.val_ != rhs.val_; + } + __device__ _CCCL_NODISCARD_FRIEND bool operator<(const device_only_type& lhs, const device_only_type& rhs) noexcept + { + return lhs.val_ < rhs.val_; + } + __device__ _CCCL_NODISCARD_FRIEND bool operator<=(const device_only_type& lhs, const device_only_type& rhs) noexcept + { + return lhs.val_ <= rhs.val_; + } + __device__ _CCCL_NODISCARD_FRIEND bool operator>(const device_only_type& lhs, const device_only_type& rhs) noexcept + { + 
return lhs.val_ > rhs.val_; + } + __device__ _CCCL_NODISCARD_FRIEND bool operator>=(const device_only_type& lhs, const device_only_type& rhs) noexcept + { + return lhs.val_ >= rhs.val_; + } + + __device__ void swap(device_only_type& other) noexcept + { + cuda::std::swap(val_, other.val_); + } +}; +#endif // _CCCL_HAS_CUDA_COMPILER + +#endif // TEST_SUPPORT_HOST_DEVICE_TYPES From 8615f321e6305a1dbbd72b8050c47e4e6b27790f Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Thu, 30 Jan 2025 00:09:17 -0800 Subject: [PATCH 15/33] [nv/target] Add sm_120 macros. (#3550) Co-authored-by: Bernhard Manfred Gruber --- libcudacxx/include/nv/detail/__target_macros | 21 ++++++++++++++++++++ libcudacxx/include/nv/target | 9 +++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/nv/detail/__target_macros b/libcudacxx/include/nv/detail/__target_macros index 85df652c7d4..2de10fc8ec4 100644 --- a/libcudacxx/include/nv/detail/__target_macros +++ b/libcudacxx/include/nv/detail/__target_macros @@ -35,6 +35,7 @@ #define _NV_TARGET_ARCH_TO_SELECTOR_900 nv::target::sm_90 #define _NV_TARGET_ARCH_TO_SELECTOR_1000 nv::target::sm_100 #define _NV_TARGET_ARCH_TO_SELECTOR_1010 nv::target::sm_101 +#define _NV_TARGET_ARCH_TO_SELECTOR_1200 nv::target::sm_120 #define _NV_TARGET_ARCH_TO_SM_350 35 #define _NV_TARGET_ARCH_TO_SM_370 37 @@ -54,6 +55,7 @@ #define _NV_TARGET_ARCH_TO_SM_900 90 #define _NV_TARGET_ARCH_TO_SM_1000 100 #define _NV_TARGET_ARCH_TO_SM_1010 101 +#define _NV_TARGET_ARCH_TO_SM_1200 120 // Only enable when compiling for CUDA/stdpar #if defined(_NV_COMPILER_NVCXX) && defined(_NVHPC_CUDA) @@ -76,6 +78,7 @@ # define _NV_TARGET_VAL_SM_90 nv::target::sm_90 # define _NV_TARGET_VAL_SM_100 nv::target::sm_100 # define _NV_TARGET_VAL_SM_101 nv::target::sm_101 +# define _NV_TARGET_VAL_SM_120 nv::target::sm_120 # define _NV_TARGET___NV_IS_HOST nv::target::is_host # define _NV_TARGET___NV_IS_DEVICE nv::target::is_device @@ -112,6 +115,7 @@ # define _NV_TARGET_VAL_SM_90 900 # define _NV_TARGET_VAL_SM_100 1000 # define _NV_TARGET_VAL_SM_101 1010 +# define _NV_TARGET_VAL_SM_120 1200 # if defined(__CUDA_ARCH__) # define _NV_TARGET_VAL __CUDA_ARCH__ @@ -160,6 +164,7 @@ # define _NV_TARGET_VAL_SM_90 900 # define _NV_TARGET_VAL_SM_100 1000 # define _NV_TARGET_VAL_SM_101 1010 +# define _NV_TARGET_VAL_SM_120 1200 # define _NV_TARGET_VAL 0 @@ -191,6 +196,7 @@ #define _NV_TARGET___NV_PROVIDES_SM_90 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_90)) #define _NV_TARGET___NV_PROVIDES_SM_100 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_100)) #define _NV_TARGET___NV_PROVIDES_SM_101 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_101)) +#define _NV_TARGET___NV_PROVIDES_SM_120 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_120)) #define _NV_TARGET___NV_IS_EXACTLY_SM_35 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_35)) #define _NV_TARGET___NV_IS_EXACTLY_SM_37 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_37)) @@ -210,6 +216,7 @@ #define _NV_TARGET___NV_IS_EXACTLY_SM_90 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_90)) #define _NV_TARGET___NV_IS_EXACTLY_SM_100 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_100)) #define _NV_TARGET___NV_IS_EXACTLY_SM_101 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_101)) +#define _NV_TARGET___NV_IS_EXACTLY_SM_120 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_120)) #define NV_PROVIDES_SM_35 __NV_PROVIDES_SM_35 #define NV_PROVIDES_SM_37 __NV_PROVIDES_SM_37 @@ -229,6 +236,7 @@ #define NV_PROVIDES_SM_90 __NV_PROVIDES_SM_90 #define NV_PROVIDES_SM_100 __NV_PROVIDES_SM_100 #define NV_PROVIDES_SM_101 
__NV_PROVIDES_SM_101 +#define NV_PROVIDES_SM_120 __NV_PROVIDES_SM_120 #define NV_IS_EXACTLY_SM_35 __NV_IS_EXACTLY_SM_35 #define NV_IS_EXACTLY_SM_37 __NV_IS_EXACTLY_SM_37 @@ -248,6 +256,7 @@ #define NV_IS_EXACTLY_SM_90 __NV_IS_EXACTLY_SM_90 #define NV_IS_EXACTLY_SM_100 __NV_IS_EXACTLY_SM_100 #define NV_IS_EXACTLY_SM_101 __NV_IS_EXACTLY_SM_101 +#define NV_IS_EXACTLY_SM_120 __NV_IS_EXACTLY_SM_120 // Disable SM_90a support on non-supporting compilers. // Will re-enable for nvcc below. @@ -381,6 +390,12 @@ # define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_101 0 # endif +# if (_NV_TARGET___NV_IS_EXACTLY_SM_120) +# define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_120 1 +# else +# define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_120 0 +# endif + // Re-enable sm_90a support in nvcc. # undef NV_HAS_FEATURE_SM_90a # define NV_HAS_FEATURE_SM_90a __NV_HAS_FEATURE_SM_90a @@ -529,6 +544,12 @@ # define _NV_TARGET_BOOL___NV_PROVIDES_SM_101 0 # endif +# if (_NV_TARGET___NV_PROVIDES_SM_120) +# define _NV_TARGET_BOOL___NV_PROVIDES_SM_120 1 +# else +# define _NV_TARGET_BOOL___NV_PROVIDES_SM_120 0 +# endif + # define _NV_ARCH_COND_CAT1(cond) _NV_TARGET_BOOL_##cond # define _NV_ARCH_COND_CAT(cond) _NV_EVAL(_NV_ARCH_COND_CAT1(cond)) diff --git a/libcudacxx/include/nv/target b/libcudacxx/include/nv/target index d8617220c84..4b77011243f 100644 --- a/libcudacxx/include/nv/target +++ b/libcudacxx/include/nv/target @@ -68,9 +68,10 @@ constexpr base_int_t sm_89_bit = 1 << 15; constexpr base_int_t sm_90_bit = 1 << 16; constexpr base_int_t sm_100_bit = 1 << 17; constexpr base_int_t sm_101_bit = 1 << 18; +constexpr base_int_t sm_120_bit = 1 << 19; constexpr base_int_t all_devices = sm_35_bit | sm_37_bit | sm_50_bit | sm_52_bit | sm_53_bit | sm_60_bit | sm_61_bit | sm_62_bit | sm_70_bit | sm_72_bit - | sm_75_bit | sm_80_bit | sm_86_bit | sm_87_bit | sm_89_bit | sm_90_bit | sm_100_bit | sm_101_bit; + | sm_75_bit | sm_80_bit | sm_86_bit | sm_87_bit | sm_89_bit | sm_90_bit | sm_100_bit | sm_101_bit | sm_120_bit; // Store a set of targets as a set of bits struct _NV_BITSET_ATTRIBUTE target_description @@ -103,6 +104,7 @@ enum class sm_selector : base_int_t sm_90 = 90, sm_100 = 100, sm_101 = 101, + sm_120 = 120, }; constexpr base_int_t toint(sm_selector a) @@ -130,12 +132,14 @@ constexpr base_int_t bitexact(sm_selector a) : toint(a) == 90 ? sm_90_bit : toint(a) == 100 ? sm_100_bit : toint(a) == 101 ? sm_101_bit + : toint(a) == 120 ? sm_120_bit : 0; } constexpr base_int_t bitrounddown(sm_selector a) { - return toint(a) >= 101 ? sm_101_bit + return toint(a) >= 120 ? sm_120_bit + : toint(a) >= 101 ? sm_101_bit : toint(a) >= 100 ? sm_100_bit : toint(a) >= 90 ? sm_90_bit : toint(a) >= 89 ? sm_89_bit @@ -214,6 +218,7 @@ constexpr sm_selector sm_89 = sm_selector::sm_89; constexpr sm_selector sm_90 = sm_selector::sm_90; constexpr sm_selector sm_100 = sm_selector::sm_100; constexpr sm_selector sm_101 = sm_selector::sm_101; +constexpr sm_selector sm_120 = sm_selector::sm_120; using detail::is_exactly; using detail::provides; From 3e888d8fd7953d595af016eacd89af610fb624e6 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 09:10:00 +0100 Subject: [PATCH 16/33] PTX: Remove internal instructions (#3583) * barrier.cluster.aligned: Remove This is not supposed to be exposed in CCCL. * elect.sync: Remove Not ready for inclusion yet. This needs to handle the optional extra output mask as well. * mapa: Remove This has compiler bugs. We should use intrinsics instead. 
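For reference, the PTX form that the removed elect.sync wrapper did not yet cover is `elect.sync d|p, membermask;`, where the optional destination d receives the lane id of the elected lane in addition to the predicate.

A minimal sketch of the intrinsic-based route suggested above for mapa, using the cooperative_groups cluster API (cg::this_cluster() / map_shared_rank(), available in recent CUDA toolkits); the kernel name and launch details are illustrative only and not part of this change:

    #include <cooperative_groups.h>

    namespace cg = cooperative_groups;

    // Must be launched with a cluster dimension, e.g. via cudaLaunchKernelEx
    // and cudaLaunchAttributeClusterDimension.
    __global__ void peer_smem_read(int* out)
    {
      __shared__ int value;
      cg::cluster_group cluster = cg::this_cluster();
      if (threadIdx.x == 0)
      {
        value = static_cast<int>(cluster.block_rank());
      }
      cluster.sync(); // make each block's shared value visible cluster-wide
      // map_shared_rank() returns the address of another block's shared-memory
      // variable; this is what mapa.shared::cluster.u32 exposed at the PTX level.
      const int* remote = cluster.map_shared_rank(&value, 0);
      if (threadIdx.x == 0)
      {
        out[cluster.block_rank()] = *remote;
      }
      cluster.sync(); // keep the owning block resident until all reads complete
    }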
Co-authored-by: Allard Hendriksen --- .../generated/barrier_cluster_aligned.rst | 63 --------- .../ptx/instructions/generated/elect_sync.rst | 11 -- .../ptx/instructions/generated/mapa.rst | 14 -- .../generated/barrier_cluster_aligned.h | 130 ------------------ .../__ptx/instructions/generated/elect_sync.h | 36 ----- .../cuda/__ptx/instructions/generated/mapa.h | 33 ----- .../ptx/generated/barrier_cluster_aligned.h | 61 -------- .../cuda/ptx/generated/elect_sync.h | 26 ---- .../test/libcudacxx/cuda/ptx/generated/mapa.h | 27 ---- 9 files changed, 401 deletions(-) delete mode 100644 docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst delete mode 100644 docs/libcudacxx/ptx/instructions/generated/elect_sync.rst delete mode 100644 docs/libcudacxx/ptx/instructions/generated/mapa.rst delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst deleted file mode 100644 index a24093ac7b6..00000000000 --- a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst +++ /dev/null @@ -1,63 +0,0 @@ -.. - This file was automatically generated. Do not edit. - -barrier.cluster.arrive.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90 - // .aligned = { .aligned } - // Marked volatile and as clobbering memory - template - __device__ static inline void barrier_cluster_arrive( - cuda::ptx::dot_aligned_t); - -barrier.cluster.wait.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // barrier.cluster.wait.aligned; // PTX ISA 78, SM_90 - // .aligned = { .aligned } - // Marked volatile and as clobbering memory - template - __device__ static inline void barrier_cluster_wait( - cuda::ptx::dot_aligned_t); - -barrier.cluster.arrive.release.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 - // .sem = { .release } - // .aligned = { .aligned } - // Marked volatile and as clobbering memory - template - __device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::dot_aligned_t); - -barrier.cluster.arrive.relaxed.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 - // .sem = { .relaxed } - // .aligned = { .aligned } - // Marked volatile - template - __device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_relaxed_t, - cuda::ptx::dot_aligned_t); - -barrier.cluster.wait.acquire.aligned -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90 - // .sem = { .acquire } - // .aligned = { .aligned } - // Marked volatile and as clobbering memory - template - __device__ static inline void barrier_cluster_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::dot_aligned_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst b/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst deleted file mode 100644 index bc909c54319..00000000000 --- a/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. - This file was automatically generated. Do not edit. - -elect.sync -^^^^^^^^^^ -.. code:: cuda - - // elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90 - template - __device__ static inline bool elect_sync( - const uint32_t& membermask); diff --git a/docs/libcudacxx/ptx/instructions/generated/mapa.rst b/docs/libcudacxx/ptx/instructions/generated/mapa.rst deleted file mode 100644 index 4ffc70d85d9..00000000000 --- a/docs/libcudacxx/ptx/instructions/generated/mapa.rst +++ /dev/null @@ -1,14 +0,0 @@ -.. - This file was automatically generated. Do not edit. - -mapa.shared::cluster.u32 -^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mapa.space.u32 dest, addr, target_cta; // PTX ISA 78, SM_90 - // .space = { .shared::cluster } - template - __device__ static inline Tp* mapa( - cuda::ptx::space_cluster_t, - const Tp* addr, - uint32_t target_cta); diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h deleted file mode 100644 index 80fe3796e69..00000000000 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h +++ /dev/null @@ -1,130 +0,0 @@ -// This file was automatically generated. Do not edit. 
- -#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ -#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ - -/* -// barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90 -// .aligned = { .aligned } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(dot_aligned_t) -{ -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.arrive.aligned;" : : : "memory"); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 780 - -/* -// barrier.cluster.wait.aligned; // PTX ISA 78, SM_90 -// .aligned = { .aligned } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_wait( - cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_wait(dot_aligned_t) -{ -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.wait.aligned;" : : : "memory"); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 780 - -/* -// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 -// .sem = { .release } -// .aligned = { .aligned } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t, dot_aligned_t) -{ -// __sem == sem_release (due to parameter type constraint) -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.arrive.release.aligned;" : : : "memory"); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 800 - -/* -// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 -// .sem = { .relaxed } -// .aligned = { .aligned } -// Marked volatile -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_relaxed_t, - cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t, dot_aligned_t) -{ -// __sem == sem_relaxed (due to parameter type constraint) -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.arrive.relaxed.aligned;" : : :); -# else - // Unsupported architectures will have a linker 
error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 800 - -/* -// barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90 -// .sem = { .acquire } -// .aligned = { .aligned } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::dot_aligned_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t, dot_aligned_t) -{ -// __sem == sem_acquire (due to parameter type constraint) -// __aligned == aligned (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - asm volatile("barrier.cluster.wait.acquire.aligned;" : : : "memory"); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -# endif -} -#endif // __cccl_ptx_isa >= 800 - -#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h b/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h deleted file mode 100644 index e8691178f14..00000000000 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h +++ /dev/null @@ -1,36 +0,0 @@ -// This file was automatically generated. Do not edit. - -#ifndef _CUDA_PTX_GENERATED_ELECT_SYNC_H_ -#define _CUDA_PTX_GENERATED_ELECT_SYNC_H_ - -/* -// elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90 -template -__device__ static inline bool elect_sync( - const uint32_t& membermask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_elect_sync_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool elect_sync(const _CUDA_VSTD::uint32_t& __membermask) -{ -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - _CUDA_VSTD::uint32_t __is_elected; - asm volatile( - "{\n\t .reg .pred P_OUT; \n\t" - "elect.sync _|P_OUT, %1;\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__is_elected) - : "r"(__membermask) - :); - return static_cast(__is_elected); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_elect_sync_is_not_supported_before_SM_90__(); - return false; -# endif -} -#endif // __cccl_ptx_isa >= 800 - -#endif // _CUDA_PTX_GENERATED_ELECT_SYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h deleted file mode 100644 index f93c8a62157..00000000000 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h +++ /dev/null @@ -1,33 +0,0 @@ -// This file was automatically generated. Do not edit. 
- -#ifndef _CUDA_PTX_GENERATED_MAPA_H_ -#define _CUDA_PTX_GENERATED_MAPA_H_ - -/* -// mapa.space.u32 dest, addr, target_cta; // PTX ISA 78, SM_90 -// .space = { .shared::cluster } -template -__device__ static inline Tp* mapa( - cuda::ptx::space_cluster_t, - const Tp* addr, - uint32_t target_cta); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mapa_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _Tp* mapa(space_cluster_t, const _Tp* __addr, _CUDA_VSTD::uint32_t __target_cta) -{ -// __space == space_cluster (due to parameter type constraint) -# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 - _CUDA_VSTD::uint32_t __dest; - asm("mapa.shared::cluster.u32 %0, %1, %2;" : "=r"(__dest) : "r"(__as_ptr_smem(__addr)), "r"(__target_cta) :); - return __from_ptr_dsmem<_Tp>(__dest); -# else - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mapa_is_not_supported_before_SM_90__(); - return __from_ptr_dsmem<_Tp>(0); -# endif -} -#endif // __cccl_ptx_isa >= 780 - -#endif // _CUDA_PTX_GENERATED_MAPA_H_ diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h deleted file mode 100644 index 6f5a022dbc8..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h +++ /dev/null @@ -1,61 +0,0 @@ -// This file was automatically generated. Do not edit. - -// We use a special strategy to force the generation of the PTX. This is mainly -// a fight against dead-code-elimination in the NVVM layer. -// -// The reason we need this strategy is because certain older versions of ptxas -// segfault when a non-sensical sequence of PTX is generated. So instead, we try -// to force the instantiation and compilation to PTX of all the overloads of the -// PTX wrapping functions. -// -// We do this by writing a function pointer of each overload to the kernel -// parameter `fn_ptr`. -// -// Because `fn_ptr` is possibly visible outside this translation unit, the -// compiler must compile all the functions which are stored. 
- -__global__ void test_barrier_cluster_aligned(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.aligned; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.wait.aligned; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_wait));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.release.aligned; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.relaxed.aligned; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // barrier.cluster.wait.acquire.aligned; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::barrier_cluster_wait));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h deleted file mode 100644 index 298225881d1..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h +++ /dev/null @@ -1,26 +0,0 @@ -// This file was automatically generated. Do not edit. - -// We use a special strategy to force the generation of the PTX. This is mainly -// a fight against dead-code-elimination in the NVVM layer. -// -// The reason we need this strategy is because certain older versions of ptxas -// segfault when a non-sensical sequence of PTX is generated. So instead, we try -// to force the instantiation and compilation to PTX of all the overloads of the -// PTX wrapping functions. -// -// We do this by writing a function pointer of each overload to the kernel -// parameter `fn_ptr`. -// -// Because `fn_ptr` is possibly visible outside this translation unit, the -// compiler must compile all the functions which are stored. - -__global__ void test_elect_sync(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // elect.sync _|is_elected, membermask; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::elect_sync));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h deleted file mode 100644 index 9160be1fe2d..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h +++ /dev/null @@ -1,27 +0,0 @@ -// This file was automatically generated. Do not edit. - -// We use a special strategy to force the generation of the PTX. This is mainly -// a fight against dead-code-elimination in the NVVM layer. -// -// The reason we need this strategy is because certain older versions of ptxas -// segfault when a non-sensical sequence of PTX is generated. So instead, we try -// to force the instantiation and compilation to PTX of all the overloads of the -// PTX wrapping functions. -// -// We do this by writing a function pointer of each overload to the kernel -// parameter `fn_ptr`. -// -// Because `fn_ptr` is possibly visible outside this translation unit, the -// compiler must compile all the functions which are stored. 
- -__global__ void test_mapa(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mapa.shared::cluster.u32 dest, addr, target_cta; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mapa));)); -#endif // __cccl_ptx_isa >= 780 -} From 15a011658172b1b63bfac8a96fb49fec6d6af92a Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Thu, 30 Jan 2025 03:03:11 -0600 Subject: [PATCH 17/33] Add dynamic CUB dispatch for merge_sort (#3525) * Add `dependent_launch` parameter to `TripleChevronFactory` * Add `ItemsPerTile()` method to `PolicyWrapper` * Add `MergeSortPolicyWrapper` * Add `KernelSource` and use `launcher_factory` to launch `merge_sort` kernels * Move the vsmem_helper to kernel source and read `BlockThreads` from there instead of the policy directly * Make `BlockThreads` templated on the policy type * Obtain `ItemsPerTile` from the kernel source through vsmem helper * Change vsmem indirection so that it is its own template parameter passed to `DispatchMergeSort` * Use `_CCCL_HOST_DEVICE` for RTC --- cub/cub/detail/launcher/cuda_runtime.cuh | 6 +- .../device/dispatch/dispatch_merge_sort.cuh | 214 +++++++++++------- .../device/dispatch/kernels/merge_sort.cuh | 22 ++ .../dispatch/tuning/tuning_merge_sort.cuh | 32 ++- cub/cub/util_device.cuh | 5 + 5 files changed, 189 insertions(+), 90 deletions(-) diff --git a/cub/cub/detail/launcher/cuda_runtime.cuh b/cub/cub/detail/launcher/cuda_runtime.cuh index 81ef450f424..f59c26d7fbb 100644 --- a/cub/cub/detail/launcher/cuda_runtime.cuh +++ b/cub/cub/detail/launcher/cuda_runtime.cuh @@ -21,10 +21,10 @@ namespace detail struct TripleChevronFactory { - CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron - operator()(dim3 grid, dim3 block, _CUDA_VSTD::size_t shared_mem, cudaStream_t stream) const + CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron operator()( + dim3 grid, dim3 block, _CUDA_VSTD::size_t shared_mem, cudaStream_t stream, bool dependent_launch = false) const { - return THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid, block, shared_mem, stream); + return THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid, block, shared_mem, stream, dependent_launch); } CUB_RUNTIME_FUNCTION cudaError_t PtxVersion(int& version) diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 056522e162d..98a4b40e8f8 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -46,7 +46,6 @@ #include #include -#include #include #include @@ -54,24 +53,89 @@ CUB_NAMESPACE_BEGIN -/******************************************************************************* - * Policy - ******************************************************************************/ - -template > + typename CompareOpT> +struct DeviceMergeSortKernelSource +{ + using KeyT = cub::detail::value_t; + using ValueT = cub::detail::value_t; + + CUB_DEFINE_KERNEL_GETTER( + MergeSortBlockSortKernel, + DeviceMergeSortBlockSortKernel< + MaxPolicyT, + KeyInputIteratorT, + ValueInputIteratorT, + KeyIteratorT, + ValueIteratorT, + OffsetT, + CompareOpT, + KeyT, + ValueT>); + + CUB_DEFINE_KERNEL_GETTER(MergeSortPartitionKernel, + DeviceMergeSortPartitionKernel); + + CUB_DEFINE_KERNEL_GETTER( + MergeSortMergeKernel, + DeviceMergeSortMergeKernel); +}; + +} // namespace detail::merge_sort + +/******************************************************************************* + * Policy + 
******************************************************************************/ + +template < + typename KeyInputIteratorT, + typename ValueInputIteratorT, + typename KeyIteratorT, + typename ValueIteratorT, + typename OffsetT, + typename CompareOpT, + typename PolicyHub = detail::merge_sort::policy_hub, + typename KernelSource = detail::merge_sort::DeviceMergeSortKernelSource< + typename PolicyHub::MaxPolicy, + KeyInputIteratorT, + ValueInputIteratorT, + KeyIteratorT, + ValueIteratorT, + OffsetT, + CompareOpT>, + typename KernelLauncherFactory = detail::TripleChevronFactory, + typename VSMemHelperPolicyT = detail::merge_sort::merge_sort_vsmem_helper_t< + typename PolicyHub::MaxPolicy::MergeSortPolicy, + KeyInputIteratorT, + ValueInputIteratorT, + KeyIteratorT, + ValueIteratorT, + OffsetT, + CompareOpT, + cub::detail::value_t, + cub::detail::value_t>> struct DispatchMergeSort { using KeyT = cub::detail::value_t; using ValueT = cub::detail::value_t; /// Whether or not there are values to be trucked along with keys - static constexpr bool KEYS_ONLY = std::is_same::value; + static constexpr bool KEYS_ONLY = ::cuda::std::is_same_v; // Problem state @@ -106,6 +170,12 @@ struct DispatchMergeSort int ptx_version; + KernelSource kernel_source; + + KernelLauncherFactory launcher_factory; + + VSMemHelperPolicyT vsmem_helper; + // Constructor CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchMergeSort( void* d_temp_storage, @@ -117,7 +187,10 @@ struct DispatchMergeSort OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, - int ptx_version) + int ptx_version, + KernelSource kernel_source = {}, + KernelLauncherFactory launcher_factory = {}, + VSMemHelperPolicyT vsmem_helper = {}) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_input_keys(d_input_keys) @@ -128,28 +201,15 @@ struct DispatchMergeSort , compare_op(compare_op) , stream(stream) , ptx_version(ptx_version) + , kernel_source(kernel_source) + , launcher_factory(launcher_factory) + , vsmem_helper(vsmem_helper) {} // Invocation template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT policy = {}) { - using MergePolicyT = typename ActivePolicyT::MergeSortPolicy; - - using merge_sort_helper_t = detail::merge_sort::merge_sort_vsmem_helper_t< - MergePolicyT, - KeyInputIteratorT, - ValueInputIteratorT, - KeyIteratorT, - ValueIteratorT, - OffsetT, - CompareOpT, - KeyT, - ValueT>; - - using BlockSortVSmemHelperT = detail::vsmem_helper_impl; - using MergeAgentVSmemHelperT = detail::vsmem_helper_impl; - cudaError error = cudaSuccess; if (num_items == 0) @@ -163,8 +223,9 @@ struct DispatchMergeSort do { - constexpr auto tile_size = merge_sort_helper_t::policy_t::ITEMS_PER_TILE; - const auto num_tiles = ::cuda::ceil_div(num_items, tile_size); + auto wrapped_policy = detail::merge_sort::MakeMergeSortPolicyWrapper(policy); + const auto tile_size = vsmem_helper.ItemsPerTile(wrapped_policy.MergeSort()); + const auto num_tiles = ::cuda::ceil_div(num_items, tile_size); const auto merge_partitions_size = static_cast(1 + num_tiles) * sizeof(OffsetT); const auto temporary_keys_storage_size = static_cast(num_items * sizeof(KeyT)); @@ -174,8 +235,8 @@ struct DispatchMergeSort * Merge sort supports large types, which can lead to excessive shared memory size requirements. In these cases, * merge sort allocates virtual shared memory that resides in global memory. 
*/ - const std::size_t block_sort_smem_size = num_tiles * BlockSortVSmemHelperT::vsmem_per_block; - const std::size_t merge_smem_size = num_tiles * MergeAgentVSmemHelperT::vsmem_per_block; + const std::size_t block_sort_smem_size = num_tiles * vsmem_helper.block_sort_vsmem_per_block(); + const std::size_t merge_smem_size = num_tiles * vsmem_helper.merge_vsmem_per_block(); const std::size_t virtual_shared_memory_size = (::cuda::std::max)(block_sort_smem_size, merge_smem_size); void* allocations[4] = {nullptr, nullptr, nullptr, nullptr}; @@ -214,29 +275,19 @@ struct DispatchMergeSort auto items_buffer = static_cast(allocations[2]); // Invoke DeviceMergeSortBlockSortKernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - static_cast(num_tiles), merge_sort_helper_t::policy_t::BLOCK_THREADS, 0, stream, true) - .doit( - detail::merge_sort::DeviceMergeSortBlockSortKernel< - typename PolicyHub::MaxPolicy, - KeyInputIteratorT, - ValueInputIteratorT, - KeyIteratorT, - ValueIteratorT, - OffsetT, - CompareOpT, - KeyT, - ValueT>, - ping, - d_input_keys, - d_input_items, - d_output_keys, - d_output_items, - num_items, - keys_buffer, - items_buffer, - compare_op, - cub::detail::vsmem_t{allocations[3]}); + launcher_factory( + static_cast(num_tiles), vsmem_helper.BlockThreads(wrapped_policy.MergeSort()), 0, stream, true) + .doit(kernel_source.MergeSortBlockSortKernel(), + ping, + d_input_keys, + d_input_items, + d_output_keys, + d_output_items, + num_items, + keys_buffer, + items_buffer, + compare_op, + cub::detail::vsmem_t{allocations[3]}); error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) @@ -273,9 +324,8 @@ struct DispatchMergeSort const OffsetT target_merged_tiles_number = OffsetT(2) << pass; // Partition - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - partition_grid_size, threads_per_partition_block, 0, stream, true) - .doit(detail::merge_sort::DeviceMergeSortPartitionKernel, + launcher_factory(partition_grid_size, threads_per_partition_block, 0, stream, true) + .doit(kernel_source.MergeSortPartitionKernel(), ping, d_output_keys, keys_buffer, @@ -300,29 +350,19 @@ struct DispatchMergeSort } // Merge - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - static_cast(num_tiles), static_cast(merge_sort_helper_t::policy_t::BLOCK_THREADS), 0, stream, true) - .doit( - detail::merge_sort::DeviceMergeSortMergeKernel< - typename PolicyHub::MaxPolicy, - KeyInputIteratorT, - ValueInputIteratorT, - KeyIteratorT, - ValueIteratorT, - OffsetT, - CompareOpT, - KeyT, - ValueT>, - ping, - d_output_keys, - d_output_items, - num_items, - keys_buffer, - items_buffer, - compare_op, - merge_partitions, - target_merged_tiles_number, - cub::detail::vsmem_t{allocations[3]}); + launcher_factory( + static_cast(num_tiles), vsmem_helper.BlockThreads(wrapped_policy.MergeSort()), 0, stream, true) + .doit(kernel_source.MergeSortMergeKernel(), + ping, + d_output_keys, + d_output_items, + num_items, + keys_buffer, + items_buffer, + compare_op, + merge_partitions, + target_merged_tiles_number, + cub::detail::vsmem_t{allocations[3]}); error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) @@ -342,6 +382,7 @@ struct DispatchMergeSort return error; } + template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, std::size_t& temp_storage_bytes, @@ -351,7 +392,11 @@ struct DispatchMergeSort ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, - cudaStream_t stream) + cudaStream_t stream, + KernelSource 
kernel_source = {}, + KernelLauncherFactory launcher_factory = {}, + MaxPolicyT max_policy = {}, + VSMemHelperPolicyT vsmem_helper = {}) { cudaError error = cudaSuccess; do @@ -375,10 +420,13 @@ struct DispatchMergeSort num_items, compare_op, stream, - ptx_version); + ptx_version, + kernel_source, + launcher_factory, + vsmem_helper); // Dispatch to chained policy - error = CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch)); + error = CubDebug(max_policy.Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; diff --git a/cub/cub/device/dispatch/kernels/merge_sort.cuh b/cub/cub/device/dispatch/kernels/merge_sort.cuh index 1065313c20d..c9a8a61395a 100644 --- a/cub/cub/device/dispatch/kernels/merge_sort.cuh +++ b/cub/cub/device/dispatch/kernels/merge_sort.cuh @@ -116,6 +116,28 @@ public: using block_sort_agent_t = ::cuda::std::_If; using merge_agent_t = ::cuda::std::_If; + + _CCCL_HOST_DEVICE static constexpr ::cuda::std::size_t block_sort_vsmem_per_block() + { + return detail::vsmem_helper_impl::vsmem_per_block; + } + + _CCCL_HOST_DEVICE static constexpr ::cuda::std::size_t merge_vsmem_per_block() + { + return detail::vsmem_helper_impl::vsmem_per_block; + } + + template + _CCCL_HOST_DEVICE static constexpr int BlockThreads(PolicyT /*policy*/) + { + return policy_t::BLOCK_THREADS; + } + + template + _CCCL_HOST_DEVICE static constexpr int ItemsPerTile(PolicyT /*policy*/) + { + return policy_t::ITEMS_PER_TILE; + } }; template +struct MergeSortPolicyWrapper : PolicyT +{ + CUB_RUNTIME_FUNCTION MergeSortPolicyWrapper(PolicyT base) + : PolicyT(base) + {} +}; + +template +struct MergeSortPolicyWrapper> + : StaticPolicyT +{ + CUB_RUNTIME_FUNCTION MergeSortPolicyWrapper(StaticPolicyT base) + : StaticPolicyT(base) + {} + + CUB_DEFINE_SUB_POLICY_GETTER(MergeSort); +}; + +template +CUB_RUNTIME_FUNCTION MergeSortPolicyWrapper MakeMergeSortPolicyWrapper(PolicyT policy) { + return MergeSortPolicyWrapper{policy}; +} + template struct policy_hub { @@ -88,8 +112,8 @@ struct policy_hub using MaxPolicy = Policy600; }; -} // namespace merge_sort -} // namespace detail + +} // namespace detail::merge_sort template using DeviceMergeSortPolicy CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and it will be " diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index fd356b8f9e5..ca365b531fc 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -553,6 +553,11 @@ struct PolicyWrapper< { return StaticPolicyT::ITEMS_PER_THREAD; } + + CUB_RUNTIME_FUNCTION static constexpr int ItemsPerTile() + { + return StaticPolicyT::ITEMS_PER_TILE; + } }; template From 5ce5d28f0572d34126e00f0765977d8c54391e8e Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 10:52:13 +0100 Subject: [PATCH 18/33] PTX: Update existing instructions (#3584) * mbarrier.expect_tx: Add missing source and test It was already documented(!) 
* cp.async.bulk.tensor: Add .{gather,scatter}4 * fence: Add .sync_restrict, .proxy.async.sync_restrict Co-authored-by: Allard Hendriksen --- .../ptx/instructions/cp_async_bulk_tensor.rst | 5 +++ docs/libcudacxx/ptx/instructions/fence.rst | 10 +++++ .../__ptx/instructions/cp_async_bulk_tensor.h | 1 + .../include/cuda/__ptx/instructions/fence.h | 2 + .../__ptx/instructions/mbarrier_expect_tx.h | 37 +++++++++++++++++++ libcudacxx/include/cuda/ptx | 1 + .../ptx.cp.async.bulk.tensor.compile.pass.cpp | 1 + .../cuda/ptx/ptx.fence.compile.pass.cpp | 2 + .../ptx.mbarrier.expect_tx.compile.pass.cpp | 22 +++++++++++ 9 files changed, 81 insertions(+) create mode 100644 libcudacxx/include/cuda/__ptx/instructions/mbarrier_expect_tx.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.expect_tx.compile.pass.cpp diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst index bde3488bac9..8dc9def989b 100644 --- a/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst @@ -21,3 +21,8 @@ Multicast --------- .. include:: generated/cp_async_bulk_tensor_multicast.rst + +Scatter / Gather +---------------- + +.. include:: generated/cp_async_bulk_tensor_gather_scatter.rst diff --git a/docs/libcudacxx/ptx/instructions/fence.rst b/docs/libcudacxx/ptx/instructions/fence.rst index 82de170f63b..4d9126be62f 100644 --- a/docs/libcudacxx/ptx/instructions/fence.rst +++ b/docs/libcudacxx/ptx/instructions/fence.rst @@ -13,6 +13,11 @@ fence .. include:: generated/fence.rst +fence.sync_restrict +------------------- + +.. include:: generated/fence_sync_restrict.rst + fence.mbarrier_init ------------------- @@ -29,6 +34,11 @@ fence.proxy.async .. include:: generated/fence_proxy_async.rst +fence.proxy.async.sync_restrict +------------------------------- + +.. include:: generated/fence_proxy_async_generic_sync_restrict.rst + fence.proxy.tensormap --------------------- diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h index 7de5b41b744..f99c0c6f73b 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h @@ -33,6 +33,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.9. 
Data Movement and Conversion Instructions: cp.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor #include +#include #include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/fence.h b/libcudacxx/include/cuda/__ptx/instructions/fence.h index a8dccf979c2..3c123840797 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/fence.h @@ -36,7 +36,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX #include #include #include +#include #include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_expect_tx.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_expect_tx.h new file mode 100644 index 00000000000..886bfe64d75 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_expect_tx.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_MBARRIER_EXPECT_TX_H_ +#define _CUDA_PTX_MBARRIER_EXPECT_TX_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_MBARRIER_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index 44edb20c98e..4798973df77 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -80,6 +80,7 @@ #include #include #include +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp index 42bc5b8e355..efd66a8fa4e 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp @@ -17,6 +17,7 @@ #include "nvrtc_workaround.h" // above header needs to be included before the generated test header #include "generated/cp_async_bulk_tensor.h" +#include "generated/cp_async_bulk_tensor_gather_scatter.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp index c439720b8f8..aa2c9ec6152 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp @@ -20,7 +20,9 @@ #include "generated/fence_mbarrier_init.h" #include "generated/fence_proxy_alias.h" #include "generated/fence_proxy_async.h" +#include "generated/fence_proxy_async_generic_sync_restrict.h" #include "generated/fence_proxy_tensormap_generic.h" +#include 
"generated/fence_sync_restrict.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.expect_tx.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.expect_tx.compile.pass.cpp new file mode 100644 index 00000000000..f4d06bdb8ba --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.expect_tx.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/mbarrier_expect_tx.h" + +int main(int, char**) +{ + return 0; +} From a1a73a8708eac531498762c22999d0a5aea076d0 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 11:26:03 +0100 Subject: [PATCH 19/33] Internalize cuda/detail/core/util.h (#3505) --- cub/cub/agent/agent_adjacent_difference.cuh | 4 +- cub/cub/agent/agent_merge.cuh | 8 +- cub/cub/agent/agent_merge_sort.cuh | 15 ++-- cub/cub/agent/agent_sub_warp_merge_sort.cuh | 4 +- cub/cub/device/dispatch/dispatch_merge.cuh | 2 +- .../device/dispatch/kernels/merge_sort.cuh | 19 ++--- .../system/cuda/detail/core/agent_launcher.h | 46 +++--------- .../system/cuda/detail/core/load_iterator.h | 4 +- .../cuda/detail/core/make_load_iterator.h | 4 +- thrust/thrust/system/cuda/detail/core/util.h | 51 +------------ thrust/thrust/system/cuda/detail/extrema.h | 18 ++--- thrust/thrust/system/cuda/detail/reduce.h | 40 +++++----- .../thrust/system/cuda/detail/reduce_by_key.h | 48 ++++++------ .../system/cuda/detail/set_operations.h | 73 +++++++++---------- thrust/thrust/system/cuda/detail/sort.h | 6 +- thrust/thrust/system/cuda/detail/unique.h | 43 +++++------ 16 files changed, 156 insertions(+), 229 deletions(-) diff --git a/cub/cub/agent/agent_adjacent_difference.cuh b/cub/cub/agent/agent_adjacent_difference.cuh index c19cb90079a..8617c78193b 100644 --- a/cub/cub/agent/agent_adjacent_difference.cuh +++ b/cub/cub/agent/agent_adjacent_difference.cuh @@ -79,7 +79,7 @@ template struct AgentDifference { - using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using BlockLoad = typename cub::BlockLoadType::type; using BlockStore = typename cub::BlockStoreType::type; @@ -119,7 +119,7 @@ struct AgentDifference OffsetT num_items) : temp_storage(temp_storage.Alias()) , input_it(input_it) - , load_it(THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(Policy(), input_it)) + , load_it(THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(Policy(), input_it)) , first_tile_previous(first_tile_previous) , result(result) , difference_op(difference_op) diff --git a/cub/cub/agent/agent_merge.cuh b/cub/cub/agent/agent_merge.cuh index 9ae14c3e42e..5c7d5322456 100644 --- a/cub/cub/agent/agent_merge.cuh +++ b/cub/cub/agent/agent_merge.cuh @@ -64,10 +64,10 @@ struct agent_t using key_type = typename ::cuda::std::iterator_traits::value_type; using item_type = typename ::cuda::std::iterator_traits::value_type; - using keys_load_it1 = typename 
THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using block_load_keys1 = typename BlockLoadType::type; using block_load_keys2 = typename BlockLoadType::type; diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh index bf4984f7256..1ec952187a7 100644 --- a/cub/cub/agent/agent_merge_sort.cuh +++ b/cub/cub/agent/agent_merge_sort.cuh @@ -91,8 +91,10 @@ struct AgentBlockSort using BlockMergeSortT = BlockMergeSort; - using KeysLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using ItemsLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using KeysLoadIt = + typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using ItemsLoadIt = + typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using BlockLoadKeys = typename cub::BlockLoadType::type; using BlockLoadItems = typename cub::BlockLoadType::type; @@ -438,10 +440,11 @@ struct AgentMerge //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- - using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using ItemsLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using KeysLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using ItemsLoadPingIt = + typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using KeysLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using KeysOutputPongIt = KeyIteratorT; using ItemsOutputPongIt = ValueIteratorT; diff --git a/cub/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/cub/agent/agent_sub_warp_merge_sort.cuh index b10f1cda3ea..9f98ac42e3b 100644 --- a/cub/cub/agent/agent_sub_warp_merge_sort.cuh +++ b/cub/cub/agent/agent_sub_warp_merge_sort.cuh @@ -183,8 +183,8 @@ public: using WarpMergeSortT = WarpMergeSort; - using KeysLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using KeysLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using WarpLoadKeysT = cub::WarpLoad; using WarpLoadItemsT = diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index b3d0c8ab2ca..c4df61fd29a 100644 --- 
a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -138,7 +138,7 @@ __launch_bounds__( CompareOp>::type; using MergePolicy = typename MergeAgent::policy; - using THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator; + using THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator; using vsmem_helper_t = vsmem_helper_impl; __shared__ typename vsmem_helper_t::static_temp_storage_t shared_temp_storage; auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage); diff --git a/cub/cub/device/dispatch/kernels/merge_sort.cuh b/cub/cub/device/dispatch/kernels/merge_sort.cuh index c9a8a61395a..79f7c6bbe40 100644 --- a/cub/cub/device/dispatch/kernels/merge_sort.cuh +++ b/cub/cub/device/dispatch/kernels/merge_sort.cuh @@ -19,12 +19,13 @@ THRUST_NAMESPACE_BEGIN -namespace cuda_cub::core +namespace cuda_cub::core::detail { // We must forward declare here because make_load_iterator.h pulls in non NVRTC compilable code template -typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE make_load_iterator(PtxPlan const&, It it); -} // namespace cuda_cub::core +typename detail::LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE +make_load_iterator(PtxPlan const&, It it); +} // namespace cuda_cub::core::detail THRUST_NAMESPACE_END @@ -196,8 +197,8 @@ __launch_bounds__( AgentBlockSortT agent( ping, temp_storage, - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_in), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_in), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_in), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_in), keys_count, keys_out, items_out, @@ -302,10 +303,10 @@ __launch_bounds__( AgentMergeT agent( ping, temp_storage, - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_ping), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_ping), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_pong), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_pong), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_ping), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_ping), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_pong), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_pong), keys_count, keys_pong, items_pong, diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h index fb7c1ef22d6..d9baeb47593 100644 --- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h @@ -62,7 +62,8 @@ namespace cuda_cub { namespace core { - +namespace detail +{ # ifndef THRUST_DETAIL_KERNEL_ATTRIBUTES # define THRUST_DETAIL_KERNEL_ATTRIBUTES CCCL_DETAIL_KERNEL_ATTRIBUTES # endif @@ -97,7 +98,7 @@ THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*, Args... 
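// The shape of this internalization, reduced to a sketch (illustrative
// namespace names only): helpers move from cuda_cub::core into a nested
// detail namespace, and every qualified use gains the extra ::detail.
namespace example::core
{
namespace detail
{
struct AgentPlan; // formerly example::core::AgentPlan
} // namespace detail
} // namespace example::core

// Call sites change accordingly, e.g.
//   core::make_load_iterator(plan, it)  ->  core::detail::make_load_iterator(plan, it)
//   core::AgentPlan                     ->  core::detail::AgentPlan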
args) template struct AgentLauncher : Agent { - core::AgentPlan plan; + AgentPlan plan; size_t count; cudaStream_t stream; char const* name; @@ -121,7 +122,7 @@ struct AgentLauncher : Agent , name(name_) , grid(static_cast((count + plan.items_per_tile - 1) / plan.items_per_tile)) , vshmem(nullptr) - , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) + , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) , shmem_size(has_shmem ? plan.shared_memory_size : 0) { assert(count > 0); @@ -136,7 +137,7 @@ struct AgentLauncher : Agent , name(name_) , grid(static_cast((count + plan.items_per_tile - 1) / plan.items_per_tile)) , vshmem(vshmem) - , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) + , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) , shmem_size(has_shmem ? plan.shared_memory_size : 0) { assert(count > 0); @@ -149,7 +150,7 @@ struct AgentLauncher : Agent , name(name_) , grid(plan.grid_size) , vshmem(nullptr) - , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) + , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) , shmem_size(has_shmem ? plan.shared_memory_size : 0) { assert(plan.grid_size > 0); @@ -162,43 +163,19 @@ struct AgentLauncher : Agent , name(name_) , grid(plan.grid_size) , vshmem(vshmem) - , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) + , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) , shmem_size(has_shmem ? plan.shared_memory_size : 0) { assert(plan.grid_size > 0); } -# if 0 - THRUST_RUNTIME_FUNCTION - AgentPlan static get_plan(cudaStream_t s, void* d_ptr = 0) - { - // in separable compilation mode, we have no choice - // but to call kernel to get agent_plan - // otherwise the risk is something may fail - // if user mix & match ptx versions in a separably compiled function - // http://nvbugs/1772071 - // XXX may be it is too string of a requirements, consider relaxing it in - // the future -# ifdef __CUDACC_RDC__ - return core::get_agent_plan(s, d_ptr); -# else - return get_agent_plan(core::get_ptx_version()); -# endif - } - THRUST_RUNTIME_FUNCTION - AgentPlan static get_plan_default() - { - return get_agent_plan(sm_arch<0>::type::ver); - } -# endif - - THRUST_RUNTIME_FUNCTION typename core::get_plan::type static get_plan(cudaStream_t, void* d_ptr = 0) + THRUST_RUNTIME_FUNCTION typename get_plan::type static get_plan(cudaStream_t, void* d_ptr = 0) { THRUST_UNUSED_VAR(d_ptr); - return get_agent_plan(core::get_ptx_version()); + return get_agent_plan(get_ptx_version()); } - THRUST_RUNTIME_FUNCTION typename core::get_plan::type static get_plan() + THRUST_RUNTIME_FUNCTION typename detail::get_plan::type static get_plan() { return get_agent_plan(lowest_supported_sm_arch::ver); } @@ -227,7 +204,7 @@ struct AgentLauncher : Agent { # if THRUST_DEBUG_SYNC_FLAG cuda_optional occ = max_sm_occupancy(k); - const int ptx_version = core::get_ptx_version(); + const int ptx_version = get_ptx_version(); if (count > 0) { _CubLog( @@ -305,6 +282,7 @@ struct AgentLauncher : Agent } }; +} // namespace detail } // namespace core } // namespace cuda_cub diff --git a/thrust/thrust/system/cuda/detail/core/load_iterator.h b/thrust/thrust/system/cuda/detail/core/load_iterator.h index 07c5eba0eaa..6f2c118b151 100644 --- 
a/thrust/thrust/system/cuda/detail/core/load_iterator.h +++ b/thrust/thrust/system/cuda/detail/core/load_iterator.h @@ -34,7 +34,7 @@ THRUST_NAMESPACE_BEGIN -namespace cuda_cub::core +namespace cuda_cub::core::detail { // LoadIterator @@ -52,6 +52,6 @@ struct LoadIterator cub::CacheModifiedInputIterator, It>; }; // struct Iterator -} // namespace cuda_cub::core +} // namespace cuda_cub::core::detail THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/core/make_load_iterator.h b/thrust/thrust/system/cuda/detail/core/make_load_iterator.h index 28c65c813ea..9497ccacca9 100644 --- a/thrust/thrust/system/cuda/detail/core/make_load_iterator.h +++ b/thrust/thrust/system/cuda/detail/core/make_load_iterator.h @@ -33,7 +33,7 @@ THRUST_NAMESPACE_BEGIN -namespace cuda_cub::core +namespace cuda_cub::core::detail { template typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE @@ -55,6 +55,6 @@ typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE make_loa return make_load_iterator_impl(it, typename is_contiguous_iterator::type()); } -} // namespace cuda_cub::core +} // namespace cuda_cub::core::detail THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h index 94a7e750aeb..b3bdcf1f086 100644 --- a/thrust/thrust/system/cuda/detail/core/util.h +++ b/thrust/thrust/system/cuda/detail/core/util.h @@ -78,6 +78,8 @@ namespace core # endif #endif +namespace detail +{ /// Typelist - a container of types template struct typelist; @@ -458,22 +460,9 @@ THRUST_RUNTIME_FUNCTION inline size_t get_max_shared_memory_per_block() return static_cast(i32value); } -THRUST_RUNTIME_FUNCTION inline size_t virtual_shmem_size(size_t shmem_per_block) -{ - size_t max_shmem_per_block = core::get_max_shared_memory_per_block(); - if (shmem_per_block > max_shmem_per_block) - { - return shmem_per_block; - } - else - { - return 0; - } -} - THRUST_RUNTIME_FUNCTION inline size_t vshmem_size(size_t shmem_per_block, size_t num_blocks) { - size_t max_shmem_per_block = core::get_max_shared_memory_per_block(); + size_t max_shmem_per_block = get_max_shared_memory_per_block(); if (shmem_per_block > max_shmem_per_block) { return shmem_per_block * num_blocks; @@ -509,22 +498,6 @@ struct BlockLoad get_arch::type::ver>; }; -// BlockStore -// ----------- -// a helper metaprogram that returns type of a block loader -template ::value_type> -struct BlockStore -{ - using type = - cub::BlockStore::type::ver>; -}; - // cuda_optional // -------------- // used for function that return cudaError_t along with the result @@ -619,16 +592,6 @@ THRUST_RUNTIME_FUNCTION inline int get_ptx_version() return ptx_version; } -THRUST_RUNTIME_FUNCTION inline cudaError_t sync_stream(cudaStream_t stream) -{ - return cub::SyncStream(stream); -} - -inline void _CCCL_DEVICE sync_threadblock() -{ - __syncthreads(); -} - // Deprecated [Since 2.8] #define CUDA_CUB_RET_IF_FAIL(e) \ { \ @@ -719,11 +682,6 @@ struct uninitialized_array } }; -_CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t align_to(size_t n, size_t align) -{ - return ((n + align - 1) / align) * align; -} - namespace host { inline cuda_optional get_max_shared_memory_per_block() @@ -753,9 +711,8 @@ THRUST_RUNTIME_FUNCTION cudaError_t alias_storage( return cub::AliasTemporaries(storage_ptr, storage_size, allocations, allocation_sizes); } +} // namespace detail } // namespace core -using core::sm52; -using core::sm60; } // namespace cuda_cub THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/extrema.h 
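// The helpers removed from core/util.h above have direct replacements, and the
// updated call sites later in this patch already use them:
//
//   core::sync_threadblock()          ->  __syncthreads();
//   core::sync_stream(stream)         ->  cub::SyncStream(stream);
//   core::align_to(n, align)          ->  ::cuda::round_up(n, align);           // assumed to come from <cuda/cmath>
//   core::virtual_shmem_size(shmem)   ->  core::detail::vshmem_size(shmem, 1);  // same "0 unless it overflows" behaviour
//
// For example, padding a temporary array to a 128-byte boundary:
//   size_t keys_temp_storage = ::cuda::round_up(sizeof(Key) * keys_count, 128);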
b/thrust/thrust/system/cuda/detail/extrema.h index 617eb8bbc79..b2124323424 100644 --- a/thrust/thrust/system/cuda/detail/extrema.h +++ b/thrust/thrust/system/cuda/detail/extrema.h @@ -184,10 +184,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( OutputIt output_it, cudaStream_t stream) { - using core::AgentLauncher; - using core::AgentPlan; - using core::cuda_optional; - using core::get_agent_plan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; + using core::detail::cuda_optional; + using core::detail::get_agent_plan; using UnsignedSize = typename detail::make_unsigned_special::type; @@ -204,7 +204,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( if (num_items <= reduce_plan.items_per_tile) { - size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1); + size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, 1); // small, single tile size if (d_temp_storage == nullptr) @@ -221,7 +221,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( else { // regular size - cuda_optional sm_count = core::get_sm_count(); + cuda_optional sm_count = core::detail::get_sm_count(); CUDA_CUB_RET_IF_FAIL(sm_count.status()); // reduction will not use more cta counts than requested @@ -245,7 +245,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( // we will launch at most "max_blocks" blocks in a grid // so preallocate virtual shared memory storage for this if required // - size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, max_blocks); + size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, max_blocks); // Temporary storage allocation requirements void* allocations[3] = {nullptr, nullptr, nullptr}; @@ -331,14 +331,14 @@ extrema(execution_policy& policy, InputIt first, Size num_items, Binary void* allocations[2] = {nullptr, nullptr}; size_t storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "extrema failed on 1st alias storage"); // Allocate temporary storage. thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "extrema failed on 2nd alias storage"); T* d_result = thrust::detail::aligned_reinterpret_cast(allocations[0]); diff --git a/thrust/thrust/system/cuda/detail/reduce.h b/thrust/thrust/system/cuda/detail/reduce.h index 3787ab62367..61ec2086adf 100644 --- a/thrust/thrust/system/cuda/detail/reduce.h +++ b/thrust/thrust/system/cuda/detail/reduce.h @@ -109,7 +109,7 @@ template struct Tuning; template -struct Tuning +struct Tuning { enum { @@ -155,7 +155,7 @@ struct ReduceAgent using tuning = Tuning; using Vector = typename cub::CubVector; - using LoadIt = typename core::LoadIterator::type; + using LoadIt = typename core::detail::LoadIterator::type; using BlockReduce = cub::BlockReduce; using VectorLoadIt = cub::CacheModifiedInputIterator; @@ -175,7 +175,7 @@ struct ReduceAgent // Other algorithms, e.g. merge, may not need additional information, // and may use AgentPlan directly, instead of defining their own Plan type. 
// - struct Plan : core::AgentPlan + struct Plan : core::detail::AgentPlan { cub::GridMappingStrategy grid_mapping; @@ -183,7 +183,7 @@ struct ReduceAgent template THRUST_RUNTIME_FUNCTION Plan(P) - : core::AgentPlan(P()) + : core::detail::AgentPlan(P()) , grid_mapping(P::GRID_MAPPING) {} }; @@ -192,7 +192,7 @@ struct ReduceAgent // ptx_plan type *must* only be used from device code // Its use from host code will result in *undefined behaviour* // - using ptx_plan = typename core::specialize_plan_msvc10_war::type::type; + using ptx_plan = typename core::detail::specialize_plan_msvc10_war::type::type; using TempStorage = typename ptx_plan::TempStorage; using Vector = typename ptx_plan::Vector; @@ -230,7 +230,7 @@ struct ReduceAgent THRUST_DEVICE_FUNCTION impl(TempStorage& storage_, InputIt input_it_, ReductionOp reduction_op_) : storage(storage_) , input_it(input_it_) - , load_it(core::make_load_iterator(ptx_plan(), input_it)) + , load_it(core::detail::make_load_iterator(ptx_plan(), input_it)) , reduction_op(reduction_op_) {} @@ -428,8 +428,6 @@ struct ReduceAgent THRUST_DEVICE_FUNCTION T consume_tiles_impl(Size num_items, cub::GridQueue queue, CAN_VECTORIZE can_vectorize) { - using core::sync_threadblock; - // We give each thread block at least one tile of input. T thread_aggregate; Size block_offset = blockIdx.x * ITEMS_PER_TILE; @@ -454,7 +452,7 @@ struct ReduceAgent storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base; } - sync_threadblock(); + __syncthreads(); // Grab tile offset and check if we're done with full tiles block_offset = storage.dequeue_offset; @@ -465,7 +463,7 @@ struct ReduceAgent consume_tile( thread_aggregate, block_offset, ITEMS_PER_TILE, thrust::detail::true_type(), can_vectorize); - sync_threadblock(); + __syncthreads(); // Dequeue a tile of items if (threadIdx.x == 0) @@ -473,7 +471,7 @@ struct ReduceAgent storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base; } - sync_threadblock(); + __syncthreads(); // Grab tile offset and check if we're done with full tiles block_offset = storage.dequeue_offset; @@ -586,7 +584,7 @@ struct DrainAgent template struct PtxPlan : PtxPolicy<1> {}; - using ptx_plan = core::specialize_plan; + using ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -609,10 +607,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( OutputIt output_it, cudaStream_t stream) { - using core::AgentLauncher; - using core::AgentPlan; - using core::cuda_optional; - using core::get_agent_plan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; + using core::detail::cuda_optional; + using core::detail::get_agent_plan; using UnsignedSize = typename detail::make_unsigned_special::type; @@ -629,7 +627,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( if (num_items <= reduce_plan.items_per_tile) { - size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1); + size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, 1); // small, single tile size if (d_temp_storage == nullptr) @@ -646,7 +644,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( else { // regular size - cuda_optional sm_count = core::get_sm_count(); + cuda_optional sm_count = core::detail::get_sm_count(); CUDA_CUB_RET_IF_FAIL(sm_count.status()); // reduction will not use more cta counts than requested @@ -670,7 +668,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( // we will launch at most "max_blocks" blocks in a grid // so 
preallocate virtual shared memory storage for this if required // - size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, max_blocks); + size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, max_blocks); // Temporary storage allocation requirements void* allocations[3] = {nullptr, nullptr, nullptr}; @@ -755,14 +753,14 @@ reduce(execution_policy& policy, InputIt first, Size num_items, T init, void* allocations[2] = {nullptr, nullptr}; size_t storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage"); // Allocate temporary storage. thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage"); T* d_result = thrust::detail::aligned_reinterpret_cast(allocations[0]); diff --git a/thrust/thrust/system/cuda/detail/reduce_by_key.h b/thrust/thrust/system/cuda/detail/reduce_by_key.h index ae1f0ffab96..8c1db436085 100644 --- a/thrust/thrust/system/cuda/detail/reduce_by_key.h +++ b/thrust/thrust/system/cuda/detail/reduce_by_key.h @@ -115,7 +115,7 @@ template struct Tuning; template -struct Tuning +struct Tuning { enum { @@ -163,11 +163,11 @@ struct ReduceByKeyAgent { using tuning = Tuning; - using KeysLoadIt = typename core::LoadIterator::type; - using ValuesLoadIt = typename core::LoadIterator::type; + using KeysLoadIt = typename core::detail::LoadIterator::type; + using ValuesLoadIt = typename core::detail::LoadIterator::type; - using BlockLoadKeys = typename core::BlockLoad::type; - using BlockLoadValues = typename core::BlockLoad::type; + using BlockLoadKeys = typename core::detail::BlockLoad::type; + using BlockLoadValues = typename core::detail::BlockLoad::type; using BlockDiscontinuityKeys = cub::BlockDiscontinuity; @@ -188,11 +188,11 @@ struct ReduceByKeyAgent typename BlockLoadKeys::TempStorage load_keys; typename BlockLoadValues::TempStorage load_values; - core::uninitialized_array raw_exchange; + core::detail::uninitialized_array raw_exchange; }; // union TempStorage }; // struct PtxPlan - using ptx_plan = typename core::specialize_plan_msvc10_war::type::type; + using ptx_plan = typename core::detail::specialize_plan_msvc10_war::type::type; using KeysLoadIt = typename ptx_plan::KeysLoadIt; using ValuesLoadIt = typename ptx_plan::ValuesLoadIt; @@ -360,9 +360,7 @@ struct ReduceByKeyAgent size_type num_tile_segments, size_type num_tile_segments_prefix) { - using core::sync_threadblock; - - sync_threadblock(); + __syncthreads(); // Compact and scatter keys # pragma unroll @@ -375,7 +373,7 @@ struct ReduceByKeyAgent } } - sync_threadblock(); + __syncthreads(); for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) { @@ -445,8 +443,6 @@ struct ReduceByKeyAgent template THRUST_DEVICE_FUNCTION void consume_first_tile(Size num_remaining, Size tile_offset, ScanTileState& tile_state) { - using core::sync_threadblock; - key_type keys[ITEMS_PER_THREAD]; // Tile keys key_type pred_keys[ITEMS_PER_THREAD]; // Tile keys shifted up (predecessor) value_type values[ITEMS_PER_THREAD]; // Tile values @@ -468,7 +464,7 @@ struct ReduceByKeyAgent 
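// Background for the vshmem_size calls in these launchers (logic shown in
// core/util.h earlier in this patch): when an agent's shared-memory footprint
// exceeds the per-block limit reported by the device, the launcher allocates a
// global-memory "virtual shared memory" buffer sized for every block it may
// launch; otherwise the size is 0 and no extra allocation is made.
//
//   size_t vshmem_bytes = core::detail::vshmem_size(plan.shared_memory_size, max_blocks);
//   // 0                                    -> kernel uses ordinary shared memory
//   // plan.shared_memory_size * max_blocks -> each block falls back to its slice of the global buffer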
BlockLoadKeys(storage.load_keys).Load(keys_load_it + tile_offset, keys); } - sync_threadblock(); + __syncthreads(); // Load values (last tile repeats final element) if (IS_LAST_TILE) @@ -481,7 +477,7 @@ struct ReduceByKeyAgent BlockLoadValues(storage.load_values).Load(values_load_it + tile_offset, values); } - sync_threadblock(); + __syncthreads(); // Set head segment_flags. // First tile sets the first flag for the first item @@ -540,8 +536,6 @@ struct ReduceByKeyAgent THRUST_DEVICE_FUNCTION void consume_subsequent_tile(Size num_remaining, int tile_idx, Size tile_offset, ScanTileState& tile_state) { - using core::sync_threadblock; - key_type keys[ITEMS_PER_THREAD]; // Tile keys key_type pred_keys[ITEMS_PER_THREAD]; // Tile keys shifted up (predecessor) value_type values[ITEMS_PER_THREAD]; // Tile values @@ -563,7 +557,7 @@ struct ReduceByKeyAgent key_type tile_pred_key = (threadIdx.x == 0) ? key_type(keys_load_it[tile_offset - 1]) : key_type(); - sync_threadblock(); + __syncthreads(); // Load values (last tile repeats final element) if (IS_LAST_TILE) @@ -576,7 +570,7 @@ struct ReduceByKeyAgent BlockLoadValues(storage.load_values).Load(values_load_it + tile_offset, values); } - sync_threadblock(); + __syncthreads(); // Set head segment_flags BlockDiscontinuityKeys(storage.scan_storage.discontinuity) @@ -635,8 +629,8 @@ struct ReduceByKeyAgent int /*num_tiles*/, ScanTileState& tile_state) : storage(storage_) - , keys_load_it(core::make_load_iterator(ptx_plan(), keys_input_it_)) - , values_load_it(core::make_load_iterator(ptx_plan(), values_input_it_)) + , keys_load_it(core::detail::make_load_iterator(ptx_plan(), keys_input_it_)) + , values_load_it(core::detail::make_load_iterator(ptx_plan(), values_input_it_)) , keys_output_it(keys_output_it_) , values_output_it(values_output_it_) , num_runs_output_it(num_runs_output_it_) @@ -703,7 +697,7 @@ struct InitAgent template struct PtxPlan : PtxPolicy<128> {}; - using ptx_plan = core::specialize_plan; + using ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -740,8 +734,8 @@ THRUST_RUNTIME_FUNCTION cudaError_t doit_step( Size num_items, cudaStream_t stream) { - using core::AgentLauncher; - using core::AgentPlan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; cudaError_t status = cudaSuccess; if (num_items == 0) @@ -762,7 +756,7 @@ THRUST_RUNTIME_FUNCTION cudaError_t doit_step( int tile_size = reduce_by_key_plan.items_per_tile; Size num_tiles = ::cuda::ceil_div(num_items, tile_size); - size_t vshmem_size = core::vshmem_size(reduce_by_key_plan.shared_memory_size, num_tiles); + size_t vshmem_size = core::detail::vshmem_size(reduce_by_key_plan.shared_memory_size, num_tiles); size_t allocation_sizes[2] = {9, vshmem_size}; status = ScanTileState::AllocationSize(static_cast(num_tiles), allocation_sizes[0]); @@ -848,14 +842,14 @@ THRUST_RUNTIME_FUNCTION pair reduce_by_key_dispatc void* allocations[2] = {nullptr, nullptr}; size_t storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage"); // Allocate temporary storage. 
thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage"); Size* d_num_runs_out = thrust::detail::aligned_reinterpret_cast(allocations[0]); diff --git a/thrust/thrust/system/cuda/detail/set_operations.h b/thrust/thrust/system/cuda/detail/set_operations.h index 7a267080bf8..b336f8e55fa 100644 --- a/thrust/thrust/system/cuda/detail/set_operations.h +++ b/thrust/thrust/system/cuda/detail/set_operations.h @@ -222,7 +222,7 @@ struct Tuning; namespace mpl = thrust::detail::mpl::math; template -struct Tuning +struct Tuning { enum { @@ -243,7 +243,7 @@ struct Tuning }; // tuning sm52 template -struct Tuning +struct Tuning { enum { @@ -290,15 +290,15 @@ struct SetOpAgent { using tuning = Tuning; - using KeysLoadIt1 = typename core::LoadIterator::type; - using KeysLoadIt2 = typename core::LoadIterator::type; - using ValuesLoadIt1 = typename core::LoadIterator::type; - using ValuesLoadIt2 = typename core::LoadIterator::type; + using KeysLoadIt1 = typename core::detail::LoadIterator::type; + using KeysLoadIt2 = typename core::detail::LoadIterator::type; + using ValuesLoadIt1 = typename core::detail::LoadIterator::type; + using ValuesLoadIt2 = typename core::detail::LoadIterator::type; - using BlockLoadKeys1 = typename core::BlockLoad::type; - using BlockLoadKeys2 = typename core::BlockLoad::type; - using BlockLoadValues1 = typename core::BlockLoad::type; - using BlockLoadValues2 = typename core::BlockLoad::type; + using BlockLoadKeys1 = typename core::detail::BlockLoad::type; + using BlockLoadKeys2 = typename core::detail::BlockLoad::type; + using BlockLoadValues1 = typename core::detail::BlockLoad::type; + using BlockLoadValues2 = typename core::detail::BlockLoad::type; using TilePrefixCallback = cub::TilePrefixCallbackOp, ScanTileState, Arch::ver>; @@ -316,7 +316,7 @@ struct SetOpAgent struct LoadStorage { - core::uninitialized_array offset; + core::detail::uninitialized_array offset; union { // FIXME These don't appear to be used anywhere? @@ -328,15 +328,15 @@ struct SetOpAgent // Allocate extra shmem than truly necessary // This will permit to avoid range checks in // serial set operations, e.g. 
serial_set_difference - core::uninitialized_array keys_shared; + core::detail::uninitialized_array keys_shared; - core::uninitialized_array values_shared; + core::detail::uninitialized_array values_shared; }; // anon union } load_storage; // struct LoadStorage }; // union TempStorage }; // struct PtxPlan - using ptx_plan = typename core::specialize_plan_msvc10_war::type::type; + using ptx_plan = typename core::detail::specialize_plan_msvc10_war::type::type; using KeysLoadIt1 = typename ptx_plan::KeysLoadIt1; using KeysLoadIt2 = typename ptx_plan::KeysLoadIt2; @@ -441,8 +441,6 @@ struct SetOpAgent Size tile_output_prefix, int tile_output_count) { - using core::sync_threadblock; - int local_scatter_idx = thread_output_prefix - tile_output_prefix; # pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) @@ -452,7 +450,7 @@ struct SetOpAgent shared[local_scatter_idx++] = input[ITEM]; } } - sync_threadblock(); + __syncthreads(); for (int item = threadIdx.x; item < tile_output_count; item += BLOCK_THREADS) { @@ -483,8 +481,7 @@ struct SetOpAgent template void THRUST_DEVICE_FUNCTION consume_tile(Size tile_idx) { - using core::sync_threadblock; - using core::uninitialized_array; + using core::detail::uninitialized_array; pair partition_beg = partitions[tile_idx + 0]; pair partition_end = partitions[tile_idx + 1]; @@ -506,7 +503,7 @@ struct SetOpAgent reg_to_shared(&storage.load_storage.keys_shared[0], keys_loc); - sync_threadblock(); + __syncthreads(); int diag_loc = min(ITEMS_PER_THREAD * threadIdx.x, num_keys1 + num_keys2); @@ -529,7 +526,7 @@ struct SetOpAgent int dst = threadIdx.x == 0 ? BLOCK_THREADS - 1 : threadIdx.x - 1; storage.load_storage.offset[dst] = value; - core::sync_threadblock(); + __syncthreads(); pair partition1_loc = thrust::make_pair( storage.load_storage.offset[threadIdx.x] >> 16, storage.load_storage.offset[threadIdx.x] & 0xFFFF); @@ -554,7 +551,7 @@ struct SetOpAgent indices, compare_op, set_op); - sync_threadblock(); + __syncthreads(); # if 0 if (ITEMS_PER_THREAD*threadIdx.x >= num_keys1 + num_keys2) active_mask = 0; @@ -588,7 +585,7 @@ struct SetOpAgent tile_output_prefix = prefix_cb.GetExclusivePrefix(); } - sync_threadblock(); + __syncthreads(); // scatter results // @@ -605,11 +602,11 @@ struct SetOpAgent value_type values_loc[ITEMS_PER_THREAD]; gmem_to_reg(values_loc, values1_in + keys1_beg, values2_in + keys2_beg, num_keys1, num_keys2); - sync_threadblock(); + __syncthreads(); reg_to_shared(&storage.load_storage.values_shared[0], values_loc); - sync_threadblock(); + __syncthreads(); // gather items from shared mem // @@ -622,7 +619,7 @@ struct SetOpAgent } } - sync_threadblock(); + __syncthreads(); scatter(values_out, values_loc, @@ -660,10 +657,10 @@ struct SetOpAgent std::size_t* output_count_) : storage(storage_) , tile_state(tile_state_) - , keys1_in(core::make_load_iterator(ptx_plan(), keys1_)) - , keys2_in(core::make_load_iterator(ptx_plan(), keys2_)) - , values1_in(core::make_load_iterator(ptx_plan(), values1_)) - , values2_in(core::make_load_iterator(ptx_plan(), values2_)) + , keys1_in(core::detail::make_load_iterator(ptx_plan(), keys1_)) + , keys2_in(core::detail::make_load_iterator(ptx_plan(), keys2_)) + , values1_in(core::detail::make_load_iterator(ptx_plan(), values1_)) + , values2_in(core::detail::make_load_iterator(ptx_plan(), values2_)) , keys1_count(keys1_count_) , keys2_count(keys2_count_) , keys_out(keys_out_) @@ -733,7 +730,7 @@ struct PartitionAgent struct PtxPlan : PtxPolicy<256> {}; - using ptx_plan = core::specialize_plan; + using 
ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -767,7 +764,7 @@ struct InitAgent struct PtxPlan : PtxPolicy<128> {}; - using ptx_plan = core::specialize_plan; + using ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -1058,8 +1055,8 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( cudaError_t status = cudaSuccess; - using core::AgentLauncher; - using core::AgentPlan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; using set_op_agent = AgentLauncher< SetOpAgent>; @@ -1080,13 +1077,13 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( status = ScanTileState::AllocationSize(static_cast(num_tiles), tile_agent_storage); CUDA_CUB_RET_IF_FAIL(status); - size_t vshmem_storage = core::vshmem_size(set_op_plan.shared_memory_size, num_tiles); + size_t vshmem_storage = core::detail::vshmem_size(set_op_plan.shared_memory_size, num_tiles); size_t partition_agent_storage = (num_tiles + 1) * sizeof(Size) * 2; void* allocations[3] = {nullptr, nullptr, nullptr}; size_t allocation_sizes[3] = {tile_agent_storage, partition_agent_storage, vshmem_storage}; - status = core::alias_storage(d_temp_storage, temp_storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(d_temp_storage, temp_storage_size, allocations, allocation_sizes); CUDA_CUB_RET_IF_FAIL(status); if (d_temp_storage == nullptr) @@ -1192,14 +1189,14 @@ THRUST_RUNTIME_FUNCTION pair set_operations( size_t storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "set_operations failed on 1st alias_storage"); // Allocate temporary storage. 
thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "set_operations failed on 2nd alias_storage"); std::size_t* d_output_count = thrust::detail::aligned_reinterpret_cast(allocations[0]); diff --git a/thrust/thrust/system/cuda/detail/sort.h b/thrust/thrust/system/cuda/detail/sort.h index 2c3ef85202d..7ad67fd4e0c 100644 --- a/thrust/thrust/system/cuda/detail/sort.h +++ b/thrust/thrust/system/cuda/detail/sort.h @@ -58,6 +58,8 @@ # include # include +# include + # include # if defined(_CCCL_HAS_NVFP16) @@ -277,8 +279,8 @@ THRUST_RUNTIME_FUNCTION void radix_sort(execution_policy& policy, Key* dispatch::doit(nullptr, temp_storage_bytes, keys_buffer, items_buffer, keys_count, stream); cuda_cub::throw_on_error(status, "radix_sort: failed on 1st step"); - size_t keys_temp_storage = core::align_to(sizeof(Key) * keys_count, 128); - size_t items_temp_storage = core::align_to(sizeof(Item) * items_count, 128); + size_t keys_temp_storage = ::cuda::round_up(sizeof(Key) * keys_count, 128); + size_t items_temp_storage = ::cuda::round_up(sizeof(Item) * items_count, 128); size_t storage_size = keys_temp_storage + items_temp_storage + temp_storage_bytes; diff --git a/thrust/thrust/system/cuda/detail/unique.h b/thrust/thrust/system/cuda/detail/unique.h index ac94017758b..1d39b161866 100644 --- a/thrust/thrust/system/cuda/detail/unique.h +++ b/thrust/thrust/system/cuda/detail/unique.h @@ -123,7 +123,7 @@ struct items_per_thread }; template -struct Tuning +struct Tuning { const static int INPUT_SIZE = sizeof(T); enum @@ -149,16 +149,16 @@ struct UniqueAgent { using tuning = Tuning; - using ItemsLoadIt = typename core::LoadIterator::type; + using ItemsLoadIt = typename core::detail::LoadIterator::type; - using BlockLoadItems = typename core::BlockLoad::type; + using BlockLoadItems = typename core::detail::BlockLoad::type; using BlockDiscontinuityItems = cub::BlockDiscontinuity; using TilePrefixCallback = cub::TilePrefixCallbackOp, ScanTileState, Arch::ver>; using BlockScan = cub::BlockScan; - using shared_items_t = core::uninitialized_array; + using shared_items_t = core::detail::uninitialized_array; union TempStorage { @@ -175,7 +175,7 @@ struct UniqueAgent }; // union TempStorage }; // struct PtxPlan - using ptx_plan = typename core::specialize_plan_msvc10_war::type::type; + using ptx_plan = typename core::detail::specialize_plan_msvc10_war::type::type; using ItemsLoadIt = typename ptx_plan::ItemsLoadIt; using BlockLoadItems = typename ptx_plan::BlockLoadItems; @@ -224,8 +224,6 @@ struct UniqueAgent Size num_selections_prefix, Size /*num_selections*/) { - using core::sync_threadblock; - # pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { @@ -236,14 +234,14 @@ struct UniqueAgent } } - sync_threadblock(); + __syncthreads(); for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) { items_out[num_selections_prefix + item] = get_shared()[item]; } - sync_threadblock(); + __syncthreads(); } //--------------------------------------------------------------------- @@ -253,8 +251,7 @@ struct UniqueAgent template Size THRUST_DEVICE_FUNCTION consume_tile_impl(int num_tile_items, int tile_idx, Size tile_base) { - using core::sync_threadblock; - using core::uninitialized_array; + using core::detail::uninitialized_array; item_type 
items_loc[ITEMS_PER_THREAD]; Size selection_flags[ITEMS_PER_THREAD]; @@ -270,7 +267,7 @@ struct UniqueAgent BlockLoadItems(temp_storage.load_items).Load(items_in + tile_base, items_loc); } - sync_threadblock(); + __syncthreads(); if (IS_FIRST_TILE) { @@ -294,7 +291,7 @@ struct UniqueAgent } } - sync_threadblock(); + __syncthreads(); Size num_tile_selections = 0; Size num_selections = 0; @@ -337,7 +334,7 @@ struct UniqueAgent } } - sync_threadblock(); + __syncthreads(); scatter(items_loc, selection_flags, @@ -420,7 +417,7 @@ struct UniqueAgent impl(storage, tile_state, - core::make_load_iterator(ptx_plan(), items_in), + core::detail::make_load_iterator(ptx_plan(), items_in), items_out, binary_pred, num_items, @@ -435,7 +432,7 @@ struct InitAgent template struct PtxPlan : PtxPolicy<128> {}; - using ptx_plan = core::specialize_plan; + using ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -463,9 +460,9 @@ static cudaError_t THRUST_RUNTIME_FUNCTION doit_step( Size num_items, cudaStream_t stream) { - using core::AgentLauncher; - using core::AgentPlan; - using core::get_agent_plan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; + using core::detail::get_agent_plan; using unique_agent = AgentLauncher>; @@ -473,14 +470,14 @@ static cudaError_t THRUST_RUNTIME_FUNCTION doit_step( using init_agent = AgentLauncher>; - using core::get_plan; + using core::detail::get_plan; typename get_plan::type init_plan = init_agent::get_plan(); typename get_plan::type unique_plan = unique_agent::get_plan(stream); int tile_size = unique_plan.items_per_tile; size_t num_tiles = ::cuda::ceil_div(num_items, tile_size); - size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size, num_tiles); + size_t vshmem_size = core::detail::vshmem_size(unique_plan.shared_memory_size, num_tiles); cudaError_t status = cudaSuccess; size_t allocation_sizes[2] = {0, vshmem_size}; @@ -550,14 +547,14 @@ THRUST_RUNTIME_FUNCTION ItemsOutputIt unique( void* allocations[2] = {nullptr, nullptr}; size_t storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "unique: failed on 1st step"); // Allocate temporary storage. 
thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "unique: failed on 2nd step"); size_type* d_num_selected_out = thrust::detail::aligned_reinterpret_cast(allocations[0]); From 9a27ba3ba2da14dd9b8bd22c04ea057d9a7f493b Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 13:07:45 +0100 Subject: [PATCH 20/33] PTX: Add clusterlaunchcontrol (#3589) Co-authored-by: Allard Hendriksen --- docs/libcudacxx/ptx/instructions.rst | 1 + .../ptx/instructions/clusterlaunchcontrol.rst | 11 ++++++ .../__ptx/instructions/clusterlaunchcontrol.h | 37 +++++++++++++++++++ libcudacxx/include/cuda/ptx | 1 + .../ptx.clusterlaunchcontrol.compile.pass.cpp | 22 +++++++++++ 5 files changed, 72 insertions(+) create mode 100644 docs/libcudacxx/ptx/instructions/clusterlaunchcontrol.rst create mode 100644 libcudacxx/include/cuda/__ptx/instructions/clusterlaunchcontrol.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index f0776974eec..32db843c28d 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -7,6 +7,7 @@ PTX Instructions :maxdepth: 1 instructions/barrier_cluster + instructions/clusterlaunchcontrol instructions/cp_async_bulk instructions/cp_async_bulk_commit_group instructions/cp_async_bulk_wait_group diff --git a/docs/libcudacxx/ptx/instructions/clusterlaunchcontrol.rst b/docs/libcudacxx/ptx/instructions/clusterlaunchcontrol.rst new file mode 100644 index 00000000000..75fe44f6f22 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/clusterlaunchcontrol.rst @@ -0,0 +1,11 @@ +.. _libcudacxx-ptx-instructions-clusterlaunchcontrol: + +clusterlaunchcontrol +==================== + +- PTX ISA: + `clusterlaunchcontrol.try_cancel `__ +- PTX ISA: + `clusterlaunchcontrol.query_cancel `__ + +.. include:: generated/clusterlaunchcontrol.rst diff --git a/libcudacxx/include/cuda/__ptx/instructions/clusterlaunchcontrol.h b/libcudacxx/include/cuda/__ptx/instructions/clusterlaunchcontrol.h new file mode 100644 index 00000000000..b15cfddf4a0 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/clusterlaunchcontrol.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
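// What these instructions are for, sketched with hypothetical wrapper names
// (the exact cuda::ptx signatures are defined by the generated header pulled
// in below): a persistent cluster asks the hardware to cancel one
// not-yet-launched cluster of the current grid and, if that succeeds, adopts
// the cancelled cluster's CTA id as its next unit of work.
//
//   // issue the asynchronous request; completion is signalled through an mbarrier
//   clusterlaunchcontrol_try_cancel(&response, smem_bar);
//   // ...wait on smem_bar...
//   if (clusterlaunchcontrol_query_cancel_is_canceled(response))
//   {
//     first_ctaid = clusterlaunchcontrol_query_cancel_get_first_ctaid(response); // hypothetical accessor
//   }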
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_CLUSTERLAUNCHCONTROL_H_ +#define _CUDA_PTX_CLUSTERLAUNCHCONTROL_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_CLUSTERLAUNCHCONTROL_H_ diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index 4798973df77..7087dd97d2a 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -70,6 +70,7 @@ */ #include +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp new file mode 100644 index 00000000000..212414c4535 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/clusterlaunchcontrol.h" + +int main(int, char**) +{ + return 0; +} From b1f2e63dafcb8d1379819e80375b1cd33393f449 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 13:10:21 +0100 Subject: [PATCH 21/33] PTX: Add st.bulk (#3604) Co-authored-by: Allard Hendriksen --- docs/libcudacxx/ptx/instructions.rst | 1 + docs/libcudacxx/ptx/instructions/st_bulk.rst | 9 +++++ .../include/cuda/__ptx/instructions/st_bulk.h | 37 +++++++++++++++++++ libcudacxx/include/cuda/ptx | 1 + .../cuda/ptx/ptx.st.bulk.compile.pass.cpp | 22 +++++++++++ 5 files changed, 70 insertions(+) create mode 100644 docs/libcudacxx/ptx/instructions/st_bulk.rst create mode 100644 libcudacxx/include/cuda/__ptx/instructions/st_bulk.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index 32db843c28d..ebf6e31f716 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -24,6 +24,7 @@ PTX Instructions instructions/mbarrier_try_wait instructions/red_async instructions/st_async + instructions/st_bulk instructions/tensormap_replace instructions/tensormap_cp_fenceproxy instructions/special_registers diff --git a/docs/libcudacxx/ptx/instructions/st_bulk.rst b/docs/libcudacxx/ptx/instructions/st_bulk.rst new file mode 100644 index 00000000000..64886598909 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/st_bulk.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-st-bulk: + +st.bulk +======= + +- PTX ISA: + `st.bulk `__ + +.. 
include:: generated/st_bulk.rst diff --git a/libcudacxx/include/cuda/__ptx/instructions/st_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/st_bulk.h new file mode 100644 index 00000000000..686e0ecf166 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/st_bulk.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_ST_BULK_H_ +#define _CUDA_PTX_ST_BULK_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_ST_BULK_H_ diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index 7087dd97d2a..db9e70ab7e6 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -86,6 +86,7 @@ #include #include #include +#include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp new file mode 100644 index 00000000000..951e1a9f513 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
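// st.bulk in one sentence: it initializes a run of shared memory in bulk. A
// hedged sketch of the wrapper (the size is in bytes and the trailing
// initializer operand currently must be 0 per the PTX ISA; the exact overload
// is defined by the generated header included below):
//
//   __shared__ alignas(16) char buffer[1024];
//   cuda::ptx::st_bulk(buffer, sizeof(buffer), /*initval=*/0);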
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/st_bulk.h" + +int main(int, char**) +{ + return 0; +} From afa2ca25d00fc9bd8037b3b2ca064f2c18708bfc Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 13:10:35 +0100 Subject: [PATCH 22/33] PTX: Add multimem instructions (#3603) * Add multimem.ld_reduce * Add multimem.red * Add multimem.st Co-authored-by: Allard Hendriksen --- docs/libcudacxx/ptx/instructions.rst | 3 ++ .../ptx/instructions/multimem_ld_reduce.rst | 9 +++++ .../ptx/instructions/multimem_red.rst | 9 +++++ .../ptx/instructions/multimem_st.rst | 9 +++++ .../__ptx/instructions/multimem_ld_reduce.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/multimem_red.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/multimem_st.h | 37 +++++++++++++++++++ libcudacxx/include/cuda/ptx | 3 ++ .../ptx.multimem.ld_reduce.compile.pass.cpp | 22 +++++++++++ .../ptx/ptx.multimem.red.compile.pass.cpp | 22 +++++++++++ .../cuda/ptx/ptx.multimem.st.compile.pass.cpp | 22 +++++++++++ 11 files changed, 210 insertions(+) create mode 100644 docs/libcudacxx/ptx/instructions/multimem_ld_reduce.rst create mode 100644 docs/libcudacxx/ptx/instructions/multimem_red.rst create mode 100644 docs/libcudacxx/ptx/instructions/multimem_st.rst create mode 100644 libcudacxx/include/cuda/__ptx/instructions/multimem_ld_reduce.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/multimem_red.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/multimem_st.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.ld_reduce.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.red.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.st.compile.pass.cpp diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index ebf6e31f716..797e26d9911 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -22,6 +22,9 @@ PTX Instructions instructions/mbarrier_expect_tx instructions/mbarrier_test_wait instructions/mbarrier_try_wait + instructions/multimem_ld_reduce + instructions/multimem_red + instructions/multimem_st instructions/red_async instructions/st_async instructions/st_bulk diff --git a/docs/libcudacxx/ptx/instructions/multimem_ld_reduce.rst b/docs/libcudacxx/ptx/instructions/multimem_ld_reduce.rst new file mode 100644 index 00000000000..e9f5212131b --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/multimem_ld_reduce.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-multimem-ld_reduce: + +multimem.ld_reduce +================== + +- PTX ISA: + `multimem.ld_reduce `__ + +.. include:: generated/multimem_ld_reduce.rst diff --git a/docs/libcudacxx/ptx/instructions/multimem_red.rst b/docs/libcudacxx/ptx/instructions/multimem_red.rst new file mode 100644 index 00000000000..0a6511b78d1 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/multimem_red.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-multimem-red: + +multimem.red +============ + +- PTX ISA: + `multimem.red `__ + +.. include:: generated/multimem_red.rst diff --git a/docs/libcudacxx/ptx/instructions/multimem_st.rst b/docs/libcudacxx/ptx/instructions/multimem_st.rst new file mode 100644 index 00000000000..75197f440c6 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/multimem_st.rst @@ -0,0 +1,9 @@ +.. 
_libcudacxx-ptx-instructions-multimem-st: + +multimem.st +=========== + +- PTX ISA: + `multimem.st `__ + +.. include:: generated/multimem_st.rst diff --git a/libcudacxx/include/cuda/__ptx/instructions/multimem_ld_reduce.h b/libcudacxx/include/cuda/__ptx/instructions/multimem_ld_reduce.h new file mode 100644 index 00000000000..29081e6107e --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/multimem_ld_reduce.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_MULTIMEM_LD_REDUCE_H_ +#define _CUDA_PTX_MULTIMEM_LD_REDUCE_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_MULTIMEM_LD_REDUCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/multimem_red.h b/libcudacxx/include/cuda/__ptx/instructions/multimem_red.h new file mode 100644 index 00000000000..f0fc4e4d0e5 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/multimem_red.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_MULTIMEM_RED_H_ +#define _CUDA_PTX_MULTIMEM_RED_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_MULTIMEM_RED_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/multimem_st.h b/libcudacxx/include/cuda/__ptx/instructions/multimem_st.h new file mode 100644 index 00000000000..608402f0131 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/multimem_st.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_MULTIMEM_ST_H_ +#define _CUDA_PTX_MULTIMEM_ST_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_MULTIMEM_ST_H_ diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index db9e70ab7e6..d11659ac6fb 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -84,6 +84,9 @@ #include #include #include +#include +#include +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.ld_reduce.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.ld_reduce.compile.pass.cpp new file mode 100644 index 00000000000..cbe0ba81971 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.ld_reduce.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/multimem_ld_reduce.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.red.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.red.compile.pass.cpp new file mode 100644 index 00000000000..b4aefa3b338 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.red.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/multimem_red.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.st.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.st.compile.pass.cpp new file mode 100644 index 00000000000..4998c854382 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.st.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. 
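The three multimem wrappers added in this commit follow the usual cuda::ptx pattern: tag types select the .sem/.scope/.op qualifiers, and the pointer must be a multimem (multicast) address set up through the CUDA multicast object APIs rather than an ordinary global pointer. A hedged sketch of a single call, with the overload shape assumed rather than quoted from the generated headers:

#include <cuda/ptx>

#include <cstdint>

__device__ void multimem_example(std::uint32_t* mc_addr, std::uint32_t v)
{
  // Assumed overload, roughly corresponding to:
  //   multimem.red.relaxed.sys.global.add.u32 [mc_addr], v;
  cuda::ptx::multimem_red(cuda::ptx::sem_relaxed, cuda::ptx::scope_sys, cuda::ptx::op_add, mc_addr, v);
}

multimem.ld_reduce and multimem.st are exposed analogously: a load that combines the values from all copies behind the multicast address, and a store that broadcasts one value to all of them.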
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/multimem_st.h" + +int main(int, char**) +{ + return 0; +} From 0f52dd50c8a049372dfba62950f490813c2217ea Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 13:38:22 +0100 Subject: [PATCH 23/33] PTX: Add cp.async.mbarrier.arrive{.noinc} (#3602) Co-authored-by: Allard Hendriksen --- docs/libcudacxx/ptx/instructions.rst | 1 + .../instructions/cp_async_mbarrier_arrive.rst | 10 +++++ .../instructions/cp_async_mbarrier_arrive.h | 38 +++++++++++++++++++ libcudacxx/include/cuda/ptx | 1 + ....cp.async.mbarrier.arrive.compile.pass.cpp | 23 +++++++++++ 5 files changed, 73 insertions(+) create mode 100644 docs/libcudacxx/ptx/instructions/cp_async_mbarrier_arrive.rst create mode 100644 libcudacxx/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.mbarrier.arrive.compile.pass.cpp diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index 797e26d9911..87ccc82b5b1 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -12,6 +12,7 @@ PTX Instructions instructions/cp_async_bulk_commit_group instructions/cp_async_bulk_wait_group instructions/cp_async_bulk_tensor + instructions/cp_async_mbarrier_arrive instructions/cp_reduce_async_bulk instructions/cp_reduce_async_bulk_tensor instructions/fence diff --git a/docs/libcudacxx/ptx/instructions/cp_async_mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/cp_async_mbarrier_arrive.rst new file mode 100644 index 00000000000..f2ff2ff5ee7 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_async_mbarrier_arrive.rst @@ -0,0 +1,10 @@ +.. _libcudacxx-ptx-instructions-cp-async-mbarrier-arrive: + +cp.async.mbarrier.arrive +======================== + +- PTX ISA: + `cp.async.mbarrier.arrive `__ + +.. include:: generated/cp_async_mbarrier_arrive.rst +.. include:: generated/cp_async_mbarrier_arrive_noinc.rst diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h new file mode 100644 index 00000000000..c19a09e2922 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h @@ -0,0 +1,38 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
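cp.async.mbarrier.arrive makes the thread's previously issued cp.async copies tracked by an mbarrier, so their completion can be waited on through the usual mbarrier machinery; the .noinc variant does not additionally increment the arrival count. A minimal sketch, assuming both wrappers simply take the address of the shared-memory mbarrier object:

#include <cuda/ptx>

#include <cstdint>

__device__ void cp_async_mbarrier_arrive_example(std::uint64_t* bar) // bar: mbarrier object in shared memory
{
  cuda::ptx::cp_async_mbarrier_arrive(bar);       // assumed signature: (uint64_t* addr)
  cuda::ptx::cp_async_mbarrier_arrive_noinc(bar); // assumed signature: (uint64_t* addr)
}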
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_CP_ASYNC_MBARRIER_ARRIVE_H_ +#define _CUDA_PTX_CP_ASYNC_MBARRIER_ARRIVE_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_CP_ASYNC_MBARRIER_ARRIVE_H_ diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index d11659ac6fb..0d699b2e2ca 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -75,6 +75,7 @@ #include #include #include +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.mbarrier.arrive.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.mbarrier.arrive.compile.pass.cpp new file mode 100644 index 00000000000..97623078198 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.mbarrier.arrive.compile.pass.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/cp_async_mbarrier_arrive.h" +#include "generated/cp_async_mbarrier_arrive_noinc.h" + +int main(int, char**) +{ + return 0; +} From 38983ebc42de5683e212562c931aa0789c6eefe7 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 16:40:27 +0100 Subject: [PATCH 24/33] PTX: Add tcgen05 instructions (#3607) * ptx: Add tcgen05.alloc * ptx: Add tcgen05.commit * ptx: Add tcgen05.cp * ptx: Add tcgen05.fence * ptx: Add tcgen05.ld * ptx: Add tcgen05.mma * ptx: Add tcgen05.mma.ws * ptx: Add tcgen05.shift * ptx: Add tcgen05.st * ptx: Add tcgen05.wait * fix docs --------- Co-authored-by: Allard Hendriksen --- docs/libcudacxx/ptx/instructions.rst | 10 +++++ .../ptx/instructions/tcgen05_alloc.rst | 9 +++++ .../ptx/instructions/tcgen05_commit.rst | 9 +++++ .../ptx/instructions/tcgen05_cp.rst | 9 +++++ .../ptx/instructions/tcgen05_fence.rst | 9 +++++ .../ptx/instructions/tcgen05_ld.rst | 9 +++++ .../ptx/instructions/tcgen05_mma.rst | 9 +++++ .../ptx/instructions/tcgen05_mma_ws.rst | 9 +++++ .../ptx/instructions/tcgen05_shift.rst | 9 +++++ .../ptx/instructions/tcgen05_st.rst | 9 +++++ .../ptx/instructions/tcgen05_wait.rst | 9 +++++ .../cuda/__ptx/instructions/tcgen05_alloc.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_commit.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_cp.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_fence.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_ld.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_mma.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_mma_ws.h | 37 +++++++++++++++++++ 
.../cuda/__ptx/instructions/tcgen05_shift.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_st.h | 37 +++++++++++++++++++ .../cuda/__ptx/instructions/tcgen05_wait.h | 37 +++++++++++++++++++ libcudacxx/include/cuda/ptx | 10 +++++ .../ptx/ptx.tcgen05.alloc.compile.pass.cpp | 22 +++++++++++ .../ptx/ptx.tcgen05.commit.compile.pass.cpp | 22 +++++++++++ .../cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp | 22 +++++++++++ .../ptx/ptx.tcgen05.fence.compile.pass.cpp | 22 +++++++++++ .../cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp | 22 +++++++++++ .../cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp | 22 +++++++++++ .../ptx/ptx.tcgen05.mma.ws.compile.pass.cpp | 22 +++++++++++ .../ptx/ptx.tcgen05.shift.compile.pass.cpp | 22 +++++++++++ .../cuda/ptx/ptx.tcgen05.st.compile.pass.cpp | 22 +++++++++++ .../ptx/ptx.tcgen05.wait.compile.pass.cpp | 22 +++++++++++ 32 files changed, 700 insertions(+) create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_alloc.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_commit.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_cp.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_fence.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_ld.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_mma.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_mma_ws.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_shift.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_st.rst create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_wait.rst create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_alloc.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_commit.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_cp.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_fence.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_ld.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma_ws.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_shift.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_st.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_wait.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index 87ccc82b5b1..136dfb81fc3 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -29,6 +29,16 @@ PTX Instructions instructions/red_async instructions/st_async instructions/st_bulk + 
instructions/tcgen05_alloc + instructions/tcgen05_commit + instructions/tcgen05_cp + instructions/tcgen05_fence + instructions/tcgen05_ld + instructions/tcgen05_mma + instructions/tcgen05_mma_ws + instructions/tcgen05_shift + instructions/tcgen05_st + instructions/tcgen05_wait instructions/tensormap_replace instructions/tensormap_cp_fenceproxy instructions/special_registers diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_alloc.rst b/docs/libcudacxx/ptx/instructions/tcgen05_alloc.rst new file mode 100644 index 00000000000..a30f2a2560c --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_alloc.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-alloc: + +tcgen05.alloc +============= + +- PTX ISA: + `tcgen05.alloc `__ + +.. include:: generated/tcgen05_alloc.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_commit.rst b/docs/libcudacxx/ptx/instructions/tcgen05_commit.rst new file mode 100644 index 00000000000..a431350dea8 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_commit.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-commit: + +tcgen05.commit +============== + +- PTX ISA: + `tcgen05.commit `__ + +.. include:: generated/tcgen05_commit.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_cp.rst b/docs/libcudacxx/ptx/instructions/tcgen05_cp.rst new file mode 100644 index 00000000000..5a220536d6e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_cp.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-cp: + +tcgen05.cp +========== + +- PTX ISA: + `tcgen05.cp `__ + +.. include:: generated/tcgen05_cp.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_fence.rst b/docs/libcudacxx/ptx/instructions/tcgen05_fence.rst new file mode 100644 index 00000000000..6635131f707 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_fence.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-fence: + +tcgen05.fence +============= + +- PTX ISA: + `tcgen05.fence `__ + +.. include:: generated/tcgen05_fence.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_ld.rst b/docs/libcudacxx/ptx/instructions/tcgen05_ld.rst new file mode 100644 index 00000000000..165b8eb935a --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_ld.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-ld: + +tcgen05.ld +========== + +- PTX ISA: + `tcgen05.ld `__ + +.. include:: generated/tcgen05_ld.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_mma.rst b/docs/libcudacxx/ptx/instructions/tcgen05_mma.rst new file mode 100644 index 00000000000..9672ae0d0a1 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_mma.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-mma: + +tcgen05.mma +=========== + +- PTX ISA: + `tcgen05.mma `__ + +.. include:: generated/tcgen05_mma.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_mma_ws.rst b/docs/libcudacxx/ptx/instructions/tcgen05_mma_ws.rst new file mode 100644 index 00000000000..e22066298ac --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_mma_ws.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-mma-ws: + +tcgen05.mma.ws +============== + +- PTX ISA: + `tcgen05.mma.ws `__ + +.. include:: generated/tcgen05_mma_ws.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_shift.rst b/docs/libcudacxx/ptx/instructions/tcgen05_shift.rst new file mode 100644 index 00000000000..eef04ae4d5e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_shift.rst @@ -0,0 +1,9 @@ +.. 
_libcudacxx-ptx-instructions-tcgen05-shift: + +tcgen05.shift +============= + +- PTX ISA: + `tcgen05.shift `__ + +.. include:: generated/tcgen05_shift.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_st.rst b/docs/libcudacxx/ptx/instructions/tcgen05_st.rst new file mode 100644 index 00000000000..f101149481f --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_st.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-st: + +tcgen05.st +========== + +- PTX ISA: + `tcgen05.st `__ + +.. include:: generated/tcgen05_st.rst diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_wait.rst b/docs/libcudacxx/ptx/instructions/tcgen05_wait.rst new file mode 100644 index 00000000000..cb149e5c9a1 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tcgen05_wait.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tcgen05-wait: + +tcgen05.wait +============ + +- PTX ISA: + `tcgen05.wait `__ + +.. include:: generated/tcgen05_wait.rst diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_alloc.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_alloc.h new file mode 100644 index 00000000000..743ee4306ee --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_alloc.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_ALLOC_H_ +#define _CUDA_PTX_TCGEN05_ALLOC_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_ALLOC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_commit.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_commit.h new file mode 100644 index 00000000000..ca06ec6b97d --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_commit.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_COMMIT_H_ +#define _CUDA_PTX_TCGEN05_COMMIT_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_COMMIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_cp.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_cp.h new file mode 100644 index 00000000000..e0c6ebf74ad --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_cp.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_CP_H_ +#define _CUDA_PTX_TCGEN05_CP_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_CP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_fence.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_fence.h new file mode 100644 index 00000000000..a36847cd0f3 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_fence.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_FENCE_H_ +#define _CUDA_PTX_TCGEN05_FENCE_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_FENCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_ld.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_ld.h new file mode 100644 index 00000000000..782ba20e804 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_ld.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_LD_H_ +#define _CUDA_PTX_TCGEN05_LD_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_LD_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma.h new file mode 100644 index 00000000000..ff9d159930b --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_MMA_H_ +#define _CUDA_PTX_TCGEN05_MMA_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_MMA_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma_ws.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma_ws.h new file mode 100644 index 00000000000..5d0bd5b8b5a --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma_ws.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_MMA_WS_H_ +#define _CUDA_PTX_TCGEN05_MMA_WS_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_MMA_WS_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_shift.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_shift.h new file mode 100644 index 00000000000..aab5cbe27b8 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_shift.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_SHIFT_H_ +#define _CUDA_PTX_TCGEN05_SHIFT_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_SHIFT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_st.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_st.h new file mode 100644 index 00000000000..94c86614b1e --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_st.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_ST_H_ +#define _CUDA_PTX_TCGEN05_ST_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_ST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_wait.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_wait.h new file mode 100644 index 00000000000..1684d9afd65 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_wait.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_PTX_TCGEN05_WAIT_H_ +#define _CUDA_PTX_TCGEN05_WAIT_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include // __CUDA_MINIMUM_ARCH__ and friends + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX + +#include + +_LIBCUDACXX_END_NAMESPACE_CUDA_PTX + +#endif // _CUDA_PTX_TCGEN05_WAIT_H_ diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx index 0d699b2e2ca..971288b456c 100644 --- a/libcudacxx/include/cuda/ptx +++ b/libcudacxx/include/cuda/ptx @@ -91,6 +91,16 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp new file mode 100644 index 00000000000..49f9df928e9 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_alloc.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp new file mode 100644 index 00000000000..73ea1851bec --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_commit.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp new file mode 100644 index 00000000000..85ddc17efe4 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
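The tcgen05 family maps to the Blackwell tensor-core generation 5 pipeline: allocating tensor memory, copying into it, issuing MMAs, and ordering those operations against the rest of the program. The data-carrying instructions have wide, type-heavy overload sets, so only the ordering-only calls are sketched here; the wrapper names and nullary signatures below are assumptions, with the generated headers being authoritative.

#include <cuda/ptx>

__device__ void tcgen05_ordering_example()
{
  // Assumed nullary wrappers for tcgen05.fence::before_thread_sync / ::after_thread_sync.
  cuda::ptx::tcgen05_fence_before_thread_sync();
  __syncthreads();
  cuda::ptx::tcgen05_fence_after_thread_sync();

  // Assumed nullary wrappers for tcgen05.wait::ld / tcgen05.wait::st.
  cuda::ptx::tcgen05_wait_ld();
  cuda::ptx::tcgen05_wait_st();
}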
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_cp.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp new file mode 100644 index 00000000000..fda57b348de --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_fence.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp new file mode 100644 index 00000000000..8da8e54f18d --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_ld.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp new file mode 100644 index 00000000000..098cbbfa896 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_mma.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp new file mode 100644 index 00000000000..350c964d749 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_mma_ws.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp new file mode 100644 index 00000000000..5ecfff7ff3b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_shift.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp new file mode 100644 index 00000000000..92a49224f0e --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_st.h" + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp new file mode 100644 index 00000000000..4bb3156ed12 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +#include "generated/tcgen05_wait.h" + +int main(int, char**) +{ + return 0; +} From cea61a3410fdea796154dcd9157e010659aab837 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 30 Jan 2025 16:48:09 +0100 Subject: [PATCH 25/33] Use a differrent implementation for `tuple_of_iterator_references` to tuple conversion (#3609) --- .../include/cuda/std/detail/libcxx/include/tuple | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple index 6ff1039e61b..47f8b16222b 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple @@ -891,10 +891,19 @@ public: enable_if_t<__is_tuple_of_iterator_references<_TupleOfIteratorReferences>::value, int> = 0, enable_if_t<(tuple_size<_TupleOfIteratorReferences>::value == sizeof...(_Tp)), int> = 0> _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 tuple(_TupleOfIteratorReferences&& __t) - : tuple(_CUDA_VSTD::forward<_TupleOfIteratorReferences>(__t).template __to_tuple<_Tp...>( - __make_tuple_indices_t())) + : tuple(_CUDA_VSTD::forward<_TupleOfIteratorReferences>(__t), + typename __make_tuple_indices::type{}) {} +private: + template ::value, int> = 0> + _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 tuple(_TupleOfIteratorReferences&& __t, __tuple_indices<_Indices...>) + : tuple(_CUDA_VSTD::get<_Indices>(_CUDA_VSTD::forward<_TupleOfIteratorReferences>(__t))...) 
+ {} + +public: template , enable_if_t::value, int> = 0, From a00de21b5e79aa5c398efeac50b37d99d580c859 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Thu, 30 Jan 2025 10:46:01 -0800 Subject: [PATCH 26/33] Remove CUB `DeviceSpMV` (#3549) --- cub/cub/agent/agent_spmv_orig.cuh | 764 --------------- cub/cub/cub.cuh | 1 - cub/cub/device/device_spmv.cuh | 216 ---- .../device/dispatch/dispatch_spmv_orig.cuh | 924 ------------------ cub/test/test_device_spmv.cu | 611 ------------ 5 files changed, 2516 deletions(-) delete mode 100644 cub/cub/agent/agent_spmv_orig.cuh delete mode 100644 cub/cub/device/device_spmv.cuh delete mode 100644 cub/cub/device/dispatch/dispatch_spmv_orig.cuh delete mode 100644 cub/test/test_device_spmv.cu diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh deleted file mode 100644 index 90a5e3aa6c9..00000000000 --- a/cub/cub/agent/agent_spmv_orig.cuh +++ /dev/null @@ -1,764 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. 
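Returning to the tuple change in the previous commit: instead of asking the tuple of iterator references to convert itself via __to_tuple, the constructor now delegates to a private constructor that expands an index sequence and converts element-wise through get<I>. The same pattern in standalone C++ looks like the sketch below (constraints and forwarding subtleties of the real libcu++ code omitted).

#include <cstddef>
#include <tuple>
#include <utility>

template <class... Ts>
struct my_tuple
{
  std::tuple<Ts...> storage;

  // Public conversion constructor: delegate with an index sequence.
  template <class TupleLike>
  my_tuple(TupleLike&& t)
      : my_tuple(std::forward<TupleLike>(t), std::make_index_sequence<sizeof...(Ts)>{})
  {}

private:
  // Private worker: expand the indices and convert element-wise via get<I>.
  template <class TupleLike, std::size_t... Is>
  my_tuple(TupleLike&& t, std::index_sequence<Is...>)
      : storage(std::get<Is>(std::forward<TupleLike>(t))...)
  {}
};

Delegating through the index sequence keeps the conversion logic inside the destination tuple and removes the need for a helper member on the source type.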
- */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -CUB_NAMESPACE_BEGIN - -/****************************************************************************** - * Tuning policy - ******************************************************************************/ - -/** - * @param Parameterizable tuning policy type for AgentSpmv - * - * @tparam _BLOCK_THREADS - * Threads per thread block - * - * @tparam _ITEMS_PER_THREAD - * Items per thread (per tile of input) - * - * @tparam _ROW_OFFSETS_SEARCH_LOAD_MODIFIER - * Cache load modifier for reading CSR row-offsets during search - * - * @tparam _ROW_OFFSETS_LOAD_MODIFIER - * Cache load modifier for reading CSR row-offsets - * - * @tparam _COLUMN_INDICES_LOAD_MODIFIER - * Cache load modifier for reading CSR column-indices - * - * @tparam _VALUES_LOAD_MODIFIER - * Cache load modifier for reading CSR values - * - * @tparam _VECTOR_VALUES_LOAD_MODIFIER - * Cache load modifier for reading vector values - * - * @tparam _DIRECT_LOAD_NONZEROS - * Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through - * shared memory) - * - * @tparam _SCAN_ALGORITHM - * The BlockScan algorithm to use - */ -template -struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmvPolicy -{ - enum - { - /// Threads per thread block - BLOCK_THREADS = _BLOCK_THREADS, - - /// Items per thread (per tile of input) - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - - /// Whether to load nonzeros directly from global during sequential merging (pre-staged through - /// shared memory) - DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, - }; - - /// Cache load modifier for reading CSR row-offsets - static constexpr CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; - - /// Cache load modifier for reading CSR row-offsets - static constexpr CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; - - /// Cache load modifier for reading CSR column-indices - static constexpr CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; - - /// Cache load modifier for reading CSR values - static constexpr CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; - - /// Cache load modifier for reading vector values - static constexpr CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; - - /// The BlockScan algorithm to use - static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; -}; - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * @tparam ValueT - * Matrix and vector value type - * - * @tparam OffsetT - * Signed integer type for sequence offsets - */ -template -struct -// with NVHPC, we get a deprecation warning in the implementation of cudaLaunchKernelEx, which we cannot suppress :/ -#if !_CCCL_COMPILER(NVHPC) - CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") -#endif - SpmvParams -{ - /// Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix - /// A. 
- const ValueT* d_values; - - /// Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices - /// and \p d_values - const OffsetT* d_row_end_offsets; - - /// Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements - /// of matrix A. (Indices are zero-valued.) - const OffsetT* d_column_indices; - - /// Pointer to the array of \p num_cols values corresponding to the dense input vector x - const ValueT* d_vector_x; - - /// Pointer to the array of \p num_rows values corresponding to the dense output vector y - ValueT* d_vector_y; - - /// Number of rows of matrix A. - int num_rows; - - /// Number of columns of matrix A. - int num_cols; - - /// Number of nonzero elements of matrix A. - int num_nonzeros; - - /// Alpha multiplicand - ValueT alpha; - - /// Beta addend-multiplicand - ValueT beta; -}; - -/** - * @brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. - * - * @tparam AgentSpmvPolicyT - * Parameterized AgentSpmvPolicy tuning policy type - * - * @tparam ValueT - * Matrix and vector value type - * - * @tparam OffsetT - * Signed integer type for sequence offsets - * - * @tparam HAS_ALPHA - * Whether the input parameter \p alpha is 1 - * - * @tparam HAS_BETA - * Whether the input parameter \p beta is 0 - * - * @tparam LEGACY_PTX_ARCH - * PTX compute capability (unused) - */ -template -struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// Constants - enum - { - BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - /// 2D merge path coordinate type - using CoordinateT = typename CubVector::Type; - - /// Input iterator wrapper types (for applying cache modifiers) - - using RowOffsetsSearchIteratorT = - CacheModifiedInputIterator; - - using RowOffsetsIteratorT = CacheModifiedInputIterator; - - using ColumnIndicesIteratorT = - CacheModifiedInputIterator; - - using ValueIteratorT = CacheModifiedInputIterator; - - using VectorValueIteratorT = - CacheModifiedInputIterator; - - // Tuple type for scanning (pairs accumulated segment-value with segment-index) - using KeyValuePairT = KeyValuePair; - - // Reduce-value-by-segment scan operator - using ReduceBySegmentOpT = ReduceByKeyOp<::cuda::std::plus<>>; - - // BlockReduce specialization - using BlockReduceT = BlockReduce; - - // BlockScan specialization - using BlockScanT = BlockScan; - - // BlockScan specialization - using BlockPrefixSumT = BlockScan; - - // BlockExchange specialization - using BlockExchangeT = BlockExchange; - - /// Merge item type (either a non-zero value or a row-end offset) - union MergeItem - { - // Value type to pair with index type OffsetT - // (NullType if loading values directly during merge) - using MergeValueT = ::cuda::std::_If; - - OffsetT row_end_offset; - MergeValueT nonzero; - }; - - /// Shared memory type required by this thread block - struct _TempStorage - { - CoordinateT tile_coords[2]; - - union Aliasable - { - // Smem needed for tile of merge items - MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1]; - - // Smem needed for block exchange - typename BlockExchangeT::TempStorage exchange; - - // Smem needed for block-wide reduction - typename BlockReduceT::TempStorage 
reduce; - - // Smem needed for tile scanning - typename BlockScanT::TempStorage scan; - - // Smem needed for tile prefix sum - typename BlockPrefixSumT::TempStorage prefix_sum; - - } aliasable; - }; - - /// Temporary storage type (unionable) - struct TempStorage : Uninitialized<_TempStorage> - {}; - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - /// Reference to temp_storage - _TempStorage& temp_storage; - - _CCCL_SUPPRESS_DEPRECATED_PUSH - SpmvParams& spmv_params; - _CCCL_SUPPRESS_DEPRECATED_POP - - /// Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements - /// of matrix A. - ValueIteratorT wd_values; - - /// Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p - /// d_column_indices and \p d_values - RowOffsetsIteratorT wd_row_end_offsets; - - /// Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero - /// elements of matrix A. (Indices are zero-valued.) - ColumnIndicesIteratorT wd_column_indices; - - /// Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector - /// x - VectorValueIteratorT wd_vector_x; - - /// Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector - /// x - VectorValueIteratorT wd_vector_y; - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * @param temp_storage - * Reference to temp_storage - * - * @param spmv_params - * SpMV input parameter bundle - */ - _CCCL_SUPPRESS_DEPRECATED_PUSH - _CCCL_DEVICE _CCCL_FORCEINLINE AgentSpmv(TempStorage& temp_storage, SpmvParams& spmv_params) - : temp_storage(temp_storage.Alias()) - , spmv_params(spmv_params) - , wd_values(spmv_params.d_values) - , wd_row_end_offsets(spmv_params.d_row_end_offsets) - , wd_column_indices(spmv_params.d_column_indices) - , wd_vector_x(spmv_params.d_vector_x) - , wd_vector_y(spmv_params.d_vector_y) - {} - _CCCL_SUPPRESS_DEPRECATED_POP - - /** - * @brief Consume a merge tile, specialized for direct-load of nonzeros - * - * @param is_direct_load - * Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch - */ - _CCCL_DEVICE _CCCL_FORCEINLINE KeyValuePairT - ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type is_direct_load) - { - int tile_num_rows = tile_end_coord.x - tile_start_coord.x; - int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; - OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; - - // Gather the row end-offsets for the merge tile into shared memory - for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) - { - const OffsetT offset = (::cuda::std::min)( - static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); - s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; - } - - __syncthreads(); - - // Search for the thread's starting coordinate within the merge tile - _CCCL_SUPPRESS_DEPRECATED_PUSH - CountingInputIterator tile_nonzero_indices(tile_start_coord.y); - _CCCL_SUPPRESS_DEPRECATED_POP - CoordinateT thread_start_coord; - - MergePathSearch( - OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal - s_tile_row_end_offsets, // List A - tile_nonzero_indices, // List B - 
tile_num_rows, - tile_num_nonzeros, - thread_start_coord); - - __syncthreads(); // Perf-sync - - // Compute the thread's merge path segment - CoordinateT thread_current_coord = thread_start_coord; - KeyValuePairT scan_segment[ITEMS_PER_THREAD]; - - ValueT running_total = 0.0; - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - OffsetT nonzero_idx = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1); - OffsetT column_idx = wd_column_indices[nonzero_idx]; - ValueT value = wd_values[nonzero_idx]; - - ValueT vector_value = wd_vector_x[column_idx]; - - ValueT nonzero = value * vector_value; - - OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; - - if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) - { - // Move down (accumulate) - running_total += nonzero; - scan_segment[ITEM].value = running_total; - scan_segment[ITEM].key = tile_num_rows; - ++thread_current_coord.y; - } - else - { - // Move right (reset) - scan_segment[ITEM].value = running_total; - scan_segment[ITEM].key = thread_current_coord.x; - running_total = 0.0; - ++thread_current_coord.x; - } - } - - __syncthreads(); - - // Block-wide reduce-value-by-segment - KeyValuePairT tile_carry; - ReduceBySegmentOpT scan_op; - KeyValuePairT scan_item; - - scan_item.value = running_total; - scan_item.key = thread_current_coord.x; - - BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); - - if (tile_num_rows > 0) - { - if (threadIdx.x == 0) - { - scan_item.key = -1; - } - -// Direct scatter -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (scan_segment[ITEM].key < tile_num_rows) - { - if (scan_item.key == scan_segment[ITEM].key) - { - scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value; - } - - if (HAS_ALPHA) - { - scan_segment[ITEM].value *= spmv_params.alpha; - } - - if (HAS_BETA) - { - // Update the output vector element - ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key]; - scan_segment[ITEM].value += addend; - } - - // Set the output vector element - spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value; - } - } - } - - // Return the tile's running carry-out - return tile_carry; - } - - /** - * @brief Consume a merge tile, specialized for indirect load of nonzeros - * - * @param is_direct_load - * Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch - */ - _CCCL_DEVICE _CCCL_FORCEINLINE KeyValuePairT - ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type is_direct_load) - { - int tile_num_rows = tile_end_coord.x - tile_start_coord.x; - int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; - -#if (CUB_PTX_ARCH >= 520) - - OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; - ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; - -// Gather the nonzeros for the merge tile into shared memory -# pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); - - ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_idx; - ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_idx; - ValueT* s = s_tile_nonzeros + nonzero_idx; - - if (nonzero_idx < tile_num_nonzeros) - { - OffsetT column_idx = *ci; - 
ValueT value = *a; - - ValueT vector_value = wd_vector_x[column_idx]; - - ValueT nonzero = value * vector_value; - - *s = nonzero; - } - } - -#else - - OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; - ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; - - // Gather the nonzeros for the merge tile into shared memory - if (tile_num_nonzeros > 0) - { -# pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); - nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1); - - OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx]; - ValueT value = wd_values[tile_start_coord.y + nonzero_idx]; - - ValueT vector_value = wd_vector_x[column_idx]; - - ValueT nonzero = value * vector_value; - - s_tile_nonzeros[nonzero_idx] = nonzero; - } - } - -#endif - -// Gather the row end-offsets for the merge tile into shared memory -#pragma unroll 1 - for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) - { - const OffsetT offset = (::cuda::std::min)( - static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); - s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; - } - - __syncthreads(); - - // Search for the thread's starting coordinate within the merge tile - _CCCL_SUPPRESS_DEPRECATED_PUSH - CountingInputIterator tile_nonzero_indices(tile_start_coord.y); - _CCCL_SUPPRESS_DEPRECATED_POP - CoordinateT thread_start_coord; - - MergePathSearch( - OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal - s_tile_row_end_offsets, // List A - tile_nonzero_indices, // List B - tile_num_rows, - tile_num_nonzeros, - thread_start_coord); - - __syncthreads(); // Perf-sync - - // Compute the thread's merge path segment - CoordinateT thread_current_coord = thread_start_coord; - KeyValuePairT scan_segment[ITEMS_PER_THREAD]; - ValueT running_total = 0.0; - - OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; - ValueT nonzero = s_tile_nonzeros[thread_current_coord.y]; - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) - { - // Move down (accumulate) - scan_segment[ITEM].value = nonzero; - running_total += nonzero; - ++thread_current_coord.y; - nonzero = s_tile_nonzeros[thread_current_coord.y]; - } - else - { - // Move right (reset) - scan_segment[ITEM].value = 0.0; - running_total = 0.0; - ++thread_current_coord.x; - row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; - } - - scan_segment[ITEM].key = thread_current_coord.x; - } - - __syncthreads(); - - // Block-wide reduce-value-by-segment - KeyValuePairT tile_carry; - ReduceBySegmentOpT scan_op; - KeyValuePairT scan_item; - - scan_item.value = running_total; - scan_item.key = thread_current_coord.x; - - BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); - - if (threadIdx.x == 0) - { - scan_item.key = thread_start_coord.x; - scan_item.value = 0.0; - } - - if (tile_num_rows > 0) - { - __syncthreads(); - - // Scan downsweep and scatter - ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; - - if (scan_item.key != scan_segment[0].key) - { - s_partials[scan_item.key] = scan_item.value; - } - else - { - scan_segment[0].value += scan_item.value; - } - -#pragma unroll - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (scan_segment[ITEM 
- 1].key != scan_segment[ITEM].key) - { - s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value; - } - else - { - scan_segment[ITEM].value += scan_segment[ITEM - 1].value; - } - } - - __syncthreads(); - -#pragma unroll 1 - for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) - { - spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item]; - } - } - - // Return the tile's running carry-out - return tile_carry; - } - - /** - * @brief Consume input tile - * - * @param[in] d_tile_coordinates - * Pointer to the temporary array of tile starting coordinates - * - * @param[out] d_tile_carry_pairs - * Pointer to the temporary array carry-out dot product row-ids, one per block - * - * @param[in] num_merge_tiles - * Number of merge tiles - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void - ConsumeTile(CoordinateT* d_tile_coordinates, KeyValuePairT* d_tile_carry_pairs, int num_merge_tiles) - { - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - - if (tile_idx >= num_merge_tiles) - { - return; - } - - // Read our starting coordinates - if (threadIdx.x < 2) - { - if (d_tile_coordinates == nullptr) - { - // Search our starting coordinates - OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS; - CoordinateT tile_coord; - _CCCL_SUPPRESS_DEPRECATED_PUSH - CountingInputIterator nonzero_indices(0); - _CCCL_SUPPRESS_DEPRECATED_POP - - // Search the merge path - MergePathSearch( - diagonal, - RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), - nonzero_indices, - spmv_params.num_rows, - spmv_params.num_nonzeros, - tile_coord); - - temp_storage.tile_coords[threadIdx.x] = tile_coord; - } - else - { - temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x]; - } - } - - __syncthreads(); - - CoordinateT tile_start_coord = temp_storage.tile_coords[0]; - CoordinateT tile_end_coord = temp_storage.tile_coords[1]; - - // Consume multi-segment tile - KeyValuePairT tile_carry = - ConsumeTile(tile_idx, tile_start_coord, tile_end_coord, Int2Type()); - - // Output the tile's carry-out - if (threadIdx.x == 0) - { - if (HAS_ALPHA) - { - tile_carry.value *= spmv_params.alpha; - } - - tile_carry.key += tile_start_coord.x; - if (tile_carry.key >= spmv_params.num_rows) - { - // FIXME: This works around an invalid memory access in the - // fixup kernel. The underlying issue needs to be debugged and - // properly fixed, but this hack prevents writes to - // out-of-bounds addresses. It doesn't appear to have an effect - // on the validity of the results, since this only affects the - // carry-over from last tile in the input. - tile_carry.key = spmv_params.num_rows - 1; - tile_carry.value = ValueT{}; - }; - - d_tile_carry_pairs[tile_idx] = tile_carry; - } - } -}; - -CUB_NAMESPACE_END diff --git a/cub/cub/cub.cuh b/cub/cub/cub.cuh index 2c4d6dd5f4e..ce55c879e0c 100644 --- a/cub/cub/cub.cuh +++ b/cub/cub/cub.cuh @@ -75,7 +75,6 @@ #include #include #include -#include #include // Grid diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh deleted file mode 100644 index 241af8cd1d1..00000000000 --- a/cub/cub/device/device_spmv.cuh +++ /dev/null @@ -1,216 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -//! @file -//! cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication -//! (SpMV). - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include - -#include -#include - -#include -#include - -#include - -CUB_NAMESPACE_BEGIN - -//! @rst -//! DeviceSpmv provides device-wide parallel operations for performing -//! sparse-matrix * dense-vector multiplication (SpMV). -//! -//! Overview -//! +++++++++++++++++++++++++++++++++++++++++++++ -//! -//! The `SpMV computation `_ -//! performs the matrix-vector operation ``y = A * x + y``, where: -//! -//! - ``A`` is an ``m * n`` sparse matrix whose non-zero structure is specified in -//! `compressed-storage-row (CSR) format -//! `_ (i.e., three -//! arrays: -//! ``values``, ``row_offsets``, and ``column_indices``) -//! - ``x`` and ``y`` are dense vectors -//! -//! Usage Considerations -//! +++++++++++++++++++++++++++++++++++++++++++++ -//! -//! @cdp_class{DeviceSpmv} -//! -//! @endrst -struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DeviceSpmv -{ - //! @name CSR matrix operations - //! @{ - - //! @rst - //! This function performs the matrix-vector operation ``y = A*x``. - //! - //! Snippet - //! +++++++++++++++++++++++++++++++++++++++++++++ - //! - //! The code snippet below illustrates SpMV upon a 9x9 CSR matrix ``A`` representing a 3x3 lattice (24 non-zeros). - //! - //! .. code-block:: c++ - //! - //! #include // or equivalently - //! - //! // Declare, allocate, and initialize device-accessible pointers for input matrix A, input - //! vector x, - //! // and output vector y - //! int num_rows = 9; - //! int num_cols = 9; - //! int num_nonzeros = 24; - //! - //! 
float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, - //! // 1, 1, 1, 1, 1, 1, 1, 1, - //! // 1, 1, 1, 1, 1, 1, 1, 1] - //! - //! int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, - //! // 4, 6, 1, 3, 5, 7, 2, 4, - //! // 8, 3, 7, 4, 6, 8, 5, 7] - //! - //! int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] - //! - //! float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] - //! float* d_vector_y; // e.g., [ , , , , , , , , ] - //! ... - //! - //! // Determine temporary device storage requirements - //! void* d_temp_storage = nullptr; - //! size_t temp_storage_bytes = 0; - //! cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, - //! d_row_offsets, d_column_indices, d_vector_x, d_vector_y, - //! num_rows, num_cols, num_nonzeros); - //! - //! // Allocate temporary storage - //! cudaMalloc(&d_temp_storage, temp_storage_bytes); - //! - //! // Run SpMV - //! cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, - //! d_row_offsets, d_column_indices, d_vector_x, d_vector_y, - //! num_rows, num_cols, num_nonzeros); - //! - //! // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] - //! - //! @endrst - //! - //! @tparam ValueT - //! **[inferred]** Matrix and vector value type (e.g., `float`, `double`, etc.) - //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. - //! When nullptr, the required allocation size is written to `temp_storage_bytes` and no work is done. - //! - //! @param[in,out] temp_storage_bytes - //! Reference to size in bytes of `d_temp_storage` allocation - //! - //! @param[in] d_values - //! Pointer to the array of `num_nonzeros` values of the corresponding nonzero elements - //! of matrix `A`. - //! - //! @param[in] d_row_offsets - //! Pointer to the array of `m + 1` offsets demarcating the start of every row in - //! `d_column_indices` and `d_values` (with the final entry being equal to `num_nonzeros`) - //! - //! @param[in] d_column_indices - //! Pointer to the array of `num_nonzeros` column-indices of the corresponding nonzero - //! elements of matrix `A`. (Indices are zero-valued.) - //! - //! @param[in] d_vector_x - //! Pointer to the array of `num_cols` values corresponding to the dense input vector `x` - //! - //! @param[out] d_vector_y - //! Pointer to the array of `num_rows` values corresponding to the dense output vector `y` - //! - //! @param[in] num_rows - //! number of rows of matrix `A`. - //! - //! @param[in] num_cols - //! number of columns of matrix `A`. - //! - //! @param[in] num_nonzeros - //! number of nonzero elements of matrix `A`. - //! - //! @param[in] stream - //! @rst - //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. - //! 
@endrst - template - CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") - CUB_RUNTIME_FUNCTION static cudaError_t - CsrMV(void* d_temp_storage, - size_t& temp_storage_bytes, - const ValueT* d_values, - const int* d_row_offsets, - const int* d_column_indices, - const ValueT* d_vector_x, - ValueT* d_vector_y, - int num_rows, - int num_cols, - int num_nonzeros, - cudaStream_t stream = 0) - { - CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSpmv::CsrMV"); - - SpmvParams spmv_params; - spmv_params.d_values = d_values; - spmv_params.d_row_end_offsets = d_row_offsets + 1; - spmv_params.d_column_indices = d_column_indices; - spmv_params.d_vector_x = d_vector_x; - spmv_params.d_vector_y = d_vector_y; - spmv_params.num_rows = num_rows; - spmv_params.num_cols = num_cols; - spmv_params.num_nonzeros = num_nonzeros; - spmv_params.alpha = ValueT{1}; - spmv_params.beta = ValueT{0}; - - _CCCL_SUPPRESS_DEPRECATED_PUSH - return DispatchSpmv::Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream); - _CCCL_SUPPRESS_DEPRECATED_POP - } - - //! @} end member group -}; - -CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh deleted file mode 100644 index 16353f392dc..00000000000 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ /dev/null @@ -1,924 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * @file - * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector - * multiplication (SpMV). 
- */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include - -CUB_NAMESPACE_BEGIN - -/****************************************************************************** - * SpMV kernel entry points - *****************************************************************************/ - -/** - * @brief Spmv search kernel. Identifies merge path starting coordinates for each tile. - * - * @tparam AgentSpmvPolicyT - * Parameterized SpmvPolicy tuning policy type - * - * @tparam ValueT - * Matrix and vector value type - * - * @tparam OffsetT - * Signed integer type for sequence offsets - * - * @param[in] spmv_params - * SpMV input parameter bundle - */ -_CCCL_SUPPRESS_DEPRECATED_PUSH -template -CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmv1ColKernel(SpmvParams spmv_params) // - _CCCL_SUPPRESS_DEPRECATED_POP -{ - using VectorValueIteratorT = - CacheModifiedInputIterator; - - VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x); - - int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - if (row_idx < spmv_params.num_rows) - { - OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx]; - OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1]; - - ValueT value = 0.0; - if (end_nonzero_idx != nonzero_idx) - { - value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]]; - } - - spmv_params.d_vector_y[row_idx] = value; - } -} - -/** - * @brief Spmv search kernel. Identifies merge path starting coordinates for each tile. 
- * - * @tparam SpmvPolicyT - * Parameterized SpmvPolicy tuning policy type - * - * @tparam OffsetT - * Signed integer type for sequence offsets - * - * @tparam CoordinateT - * Merge path coordinate type - * - * @tparam SpmvParamsT - * SpmvParams type - * - * @param[in] num_merge_tiles - * Number of SpMV merge tiles (spmv grid size) - * - * @param[out] d_tile_coordinates - * Pointer to the temporary array of tile starting coordinates - * - * @param[in] spmv_params - * SpMV input parameter bundle - */ -template -CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel( - int num_merge_tiles, CoordinateT* d_tile_coordinates, SpmvParamsT spmv_params) -{ - /// Constants - enum - { - BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - using RowOffsetsSearchIteratorT = - CacheModifiedInputIterator; - - // Find the starting coordinate for all tiles (plus the end coordinate of the last one) - int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tile_idx < num_merge_tiles + 1) - { - OffsetT diagonal = (tile_idx * TILE_ITEMS); - CoordinateT tile_coordinate; - _CCCL_SUPPRESS_DEPRECATED_PUSH - CountingInputIterator nonzero_indices(0); - _CCCL_SUPPRESS_DEPRECATED_POP - - // Search the merge path - MergePathSearch( - diagonal, - RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), - nonzero_indices, - spmv_params.num_rows, - spmv_params.num_nonzeros, - tile_coordinate); - - // Output starting offset - d_tile_coordinates[tile_idx] = tile_coordinate; - } -} - -/** - * @brief Spmv agent entry point - * - * @tparam SpmvPolicyT - * Parameterized SpmvPolicy tuning policy type - * - * @tparam ScanTileStateT - * Tile status interface type - * - * @tparam ValueT - * Matrix and vector value type - * - * @tparam OffsetT - * Signed integer type for sequence offsets - * - * @tparam CoordinateT - * Merge path coordinate type - * - * @tparam HAS_ALPHA - * Whether the input parameter Alpha is 1 - * - * @tparam HAS_BETA - * Whether the input parameter Beta is 0 - * - * @param[in] spmv_params - * SpMV input parameter bundle - * - * @param[in] d_tile_coordinates - * Pointer to the temporary array of tile starting coordinates - * - * @param[out] d_tile_carry_pairs - * Pointer to the temporary array carry-out dot product row-ids, one per block - * - * @param[in] num_tiles - * Number of merge tiles - * - * @param[in] tile_state - * Tile status interface for fixup reduce-by-key kernel - * - * @param[in] num_segment_fixup_tiles - * Number of reduce-by-key tiles (fixup grid size) - */ -template -CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") -__launch_bounds__(int(SpmvPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvKernel( - SpmvParams spmv_params, - CoordinateT* d_tile_coordinates, - KeyValuePair* d_tile_carry_pairs, - int num_tiles, - ScanTileStateT tile_state, - int num_segment_fixup_tiles) -{ - // Spmv agent type specialization - _CCCL_SUPPRESS_DEPRECATED_PUSH - using AgentSpmvT = AgentSpmv; - _CCCL_SUPPRESS_DEPRECATED_POP - - // Shared memory for AgentSpmv - __shared__ typename AgentSpmvT::TempStorage temp_storage; - - AgentSpmvT(temp_storage, spmv_params).ConsumeTile(d_tile_coordinates, d_tile_carry_pairs, num_tiles); - - // Initialize fixup tile status - tile_state.InitializeStatus(num_segment_fixup_tiles); -} - -/** - * @tparam ValueT - * Matrix and vector value type - * - * @tparam OffsetT - * Signed integer 
type for sequence offsets - * - * @tparam HAS_BETA - * Whether the input parameter Beta is 0 - */ -template -CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvEmptyMatrixKernel(SpmvParams spmv_params) -{ - const int row = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - - if (row < spmv_params.num_rows) - { - ValueT result = 0.0; - - _CCCL_IF_CONSTEXPR (HAS_BETA) - { - result += spmv_params.beta * spmv_params.d_vector_y[row]; - } - - spmv_params.d_vector_y[row] = result; - } -} - -/** - * @brief Multi-block reduce-by-key sweep kernel entry point - * - * @tparam AgentSegmentFixupPolicyT - * Parameterized AgentSegmentFixupPolicy tuning policy type - * - * @tparam PairsInputIteratorT - * Random-access input iterator type for keys - * - * @tparam AggregatesOutputIteratorT - * Random-access output iterator type for values - * - * @tparam OffsetT - * Signed integer type for global offsets - * - * @tparam ScanTileStateT - * Tile status interface type - * - * @param[in] d_pairs_in - * Pointer to the array carry-out dot product row-ids, one per spmv block - * - * @param[in,out] d_aggregates_out - * Output value aggregates - * - * @param[in] num_items - * Total number of items to select from - * - * @param[in] num_tiles - * Total number of tiles for the entire problem - * - * @param[in] tile_state - * Tile status interface - */ -_CCCL_SUPPRESS_DEPRECATED_PUSH -template -CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") -__launch_bounds__(int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) - CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentFixupKernel( - PairsInputIteratorT d_pairs_in, - AggregatesOutputIteratorT d_aggregates_out, - OffsetT num_items, - int num_tiles, - ScanTileStateT tile_state) // - _CCCL_SUPPRESS_DEPRECATED_POP -{ - // Thread block type for reducing tiles of value segments - using AgentSegmentFixupT = - AgentSegmentFixup, - ::cuda::std::plus<>, - OffsetT>; - - // Shared memory for AgentSegmentFixup - __shared__ typename AgentSegmentFixupT::TempStorage temp_storage; - - // Process tiles - AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, ::cuda::std::equal_to<>{}, ::cuda::std::plus<>{}) - .ConsumeRange(num_items, num_tiles, tile_state); -} - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * @brief Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv - * - * @tparam ValueT - * Matrix and vector value type - * - * @tparam OffsetT - * Signed integer type for global offsets - */ -template -struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv -{ - //--------------------------------------------------------------------- - // Constants and Types - //--------------------------------------------------------------------- - - enum - { - INIT_KERNEL_THREADS = 128, - EMPTY_MATRIX_KERNEL_THREADS = 128 - }; - - // SpmvParams bundle type - using SpmvParamsT = SpmvParams; - - // 2D merge path coordinate type - using CoordinateT = typename CubVector::Type; - - // Tile status descriptor interface type - using ScanTileStateT = ReduceByKeyScanTileState; - - // Tuple type for scanning (pairs accumulated segment-value with segment-index) - using KeyValuePairT = KeyValuePair; - - //--------------------------------------------------------------------- - // Tuning policies - //--------------------------------------------------------------------- - - 
/// SM50 - struct Policy500 - { - using SpmvPolicyT = - AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 64 : 128, - (sizeof(ValueT) > 4) ? 6 : 7, - LOAD_LDG, - LOAD_DEFAULT, - (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, - (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, - LOAD_LDG, - (sizeof(ValueT) > 4) ? true : false, - (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE>; - - using SegmentFixupPolicyT = - AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_RAKING_MEMOIZE>; - }; - - /// SM60 - struct Policy600 - { - using SpmvPolicyT = - AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 64 : 128, - (sizeof(ValueT) > 4) ? 5 : 7, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS>; - - using SegmentFixupPolicyT = AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_DIRECT, LOAD_LDG, BLOCK_SCAN_WARP_SCANS>; - }; - - //--------------------------------------------------------------------- - // Tuning policies of current PTX compiler pass - //--------------------------------------------------------------------- - -#if (CUB_PTX_ARCH >= 600) - using PtxPolicy = Policy600; - -#else - using PtxPolicy = Policy500; -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT - {}; - struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT - {}; - - //--------------------------------------------------------------------- - // Utilities - //--------------------------------------------------------------------- - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static void - InitConfigs(int ptx_version, KernelConfig& spmv_config, KernelConfig& segment_fixup_config) - { - NV_IF_TARGET( - NV_IS_DEVICE, - ( // We're on the device, so initialize the kernel dispatch - // configurations with the current PTX policy - spmv_config.template Init(); segment_fixup_config.template Init();), - ( - // We're on the host, so lookup and initialize the kernel dispatch - // configurations with the policies that match the device's PTX - // version - if (ptx_version >= 600) { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } else if (ptx_version >= 500) { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } else { - spmv_config.template Init(); - segment_fixup_config.template Init(); - })); - } - - /** - * Kernel kernel dispatch configuration. - */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - int tile_items; - - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE void Init() - { - block_threads = PolicyT::BLOCK_THREADS; - items_per_thread = PolicyT::ITEMS_PER_THREAD; - tile_items = block_threads * items_per_thread; - } - }; - - //--------------------------------------------------------------------- - // Dispatch entrypoints - //--------------------------------------------------------------------- - - /** - * Internal dispatch routine for computing a device-wide reduction using the - * specified kernel functions. - * - * If the input is larger than a single tile, this method uses two-passes of - * kernel invocations. 
- * - * @tparam Spmv1ColKernelT - * Function type of cub::DeviceSpmv1ColKernel - * - * @tparam SpmvSearchKernelT - * Function type of cub::AgentSpmvSearchKernel - * - * @tparam SpmvKernelT - * Function type of cub::AgentSpmvKernel - * - * @tparam SegmentFixupKernelT - * Function type of cub::DeviceSegmentFixupKernelT - * - * @tparam SpmvEmptyMatrixKernelT - * Function type of cub::DeviceSpmvEmptyMatrixKernel - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. - * When nullptr, the required allocation size is written to - * `temp_storage_bytes` and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of \p d_temp_storage allocation - * - * @paramSpMV spmv_params - * input parameter bundle - * - * @param[in] stream - * CUDA stream to launch kernels within. Default is stream0. - * - * @param[in] spmv_1col_kernel - * Kernel function pointer to parameterization of DeviceSpmv1ColKernel - * - * @param[in] spmv_search_kernel - * Kernel function pointer to parameterization of AgentSpmvSearchKernel - * - * @param[in] spmv_kernel - * Kernel function pointer to parameterization of AgentSpmvKernel - * - * @param[in] segment_fixup_kernel - * Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel - * - * @param[in] spmv_empty_matrix_kernel - * Kernel function pointer to parameterization of cub::DeviceSpmvEmptyMatrixKernel - * - * @param[in] spmv_config - * Dispatch parameters that match the policy that @p spmv_kernel was compiled for - * - * @param[in] segment_fixup_config - * Dispatch parameters that match the policy that @p segment_fixup_kernel was compiled for - */ - template - CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - SpmvParamsT& spmv_params, - cudaStream_t stream, - Spmv1ColKernelT spmv_1col_kernel, - SpmvSearchKernelT spmv_search_kernel, - SpmvKernelT spmv_kernel, - SegmentFixupKernelT segment_fixup_kernel, - SpmvEmptyMatrixKernelT spmv_empty_matrix_kernel, - KernelConfig spmv_config, - KernelConfig segment_fixup_config) - { - cudaError error = cudaSuccess; - do - { - if (spmv_params.num_rows < 0 || spmv_params.num_cols < 0) - { - return cudaErrorInvalidValue; - } - - if (spmv_params.num_rows == 0 || spmv_params.num_cols == 0) - { // Empty problem, no-op. 
- if (d_temp_storage == nullptr) - { - temp_storage_bytes = 1; - } - - break; - } - - if (spmv_params.num_nonzeros == 0) - { - if (d_temp_storage == nullptr) - { - // Return if the caller is simply requesting the size of the storage allocation - temp_storage_bytes = 1; - break; - } - - constexpr int threads_in_block = EMPTY_MATRIX_KERNEL_THREADS; - const int blocks_in_grid = ::cuda::ceil_div(spmv_params.num_rows, threads_in_block); - -#ifdef CUB_DEBUG_LOG - _CubLog("Invoking spmv_empty_matrix_kernel<<<%d, %d, 0, %lld>>>()\n", - blocks_in_grid, - threads_in_block, - (long long) stream); -#endif // CUB_DEBUG_LOG - error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) - .doit(spmv_empty_matrix_kernel, spmv_params); - - if (CubDebug(error)) - { - break; - } - - // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) - { - break; - } - - break; - } - - if (spmv_params.num_cols == 1) - { - if (d_temp_storage == nullptr) - { - // Return if the caller is simply requesting the size of the storage allocation - temp_storage_bytes = 1; - break; - } - - // Get search/init grid dims - int degen_col_kernel_block_size = INIT_KERNEL_THREADS; - int degen_col_kernel_grid_size = ::cuda::ceil_div(spmv_params.num_rows, degen_col_kernel_block_size); - -#ifdef CUB_DEBUG_LOG - _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", - degen_col_kernel_grid_size, - degen_col_kernel_block_size, - (long long) stream); -#endif // CUB_DEBUG_LOG - - // Invoke spmv_search_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream) - .doit(spmv_1col_kernel, spmv_params); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) - { - break; - } - - // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) - { - break; - } - - break; - } - - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) - { - break; - } - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) - { - break; - } - - // Get max x-dimension of grid - int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) - { - break; - } - - // Total number of spmv work items - int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros; - - // Tile sizes of kernels - int merge_tile_size = spmv_config.block_threads * spmv_config.items_per_thread; - int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread; - - // Number of tiles for kernels - int num_merge_tiles = ::cuda::ceil_div(num_merge_items, merge_tile_size); - int num_segment_fixup_tiles = ::cuda::ceil_div(num_merge_tiles, segment_fixup_tile_size); - - // Get SM occupancy for kernels - int spmv_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(spmv_sm_occupancy, spmv_kernel, spmv_config.block_threads))) - { - break; - } - - int segment_fixup_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - segment_fixup_sm_occupancy, segment_fixup_kernel, segment_fixup_config.block_threads))) - { - break; - } - - // Get grid dimensions - dim3 spmv_grid_size(CUB_MIN(num_merge_tiles, max_dim_x), ::cuda::ceil_div(num_merge_tiles, max_dim_x), 1); - - dim3 segment_fixup_grid_size( - 
CUB_MIN(num_segment_fixup_tiles, max_dim_x), ::cuda::ceil_div(num_segment_fixup_tiles, max_dim_x), 1); - - // Get the temporary storage allocation requirements - size_t allocation_sizes[3]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) - { - break; // bytes needed for reduce-by-key tile status descriptors - } - allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT); // bytes needed for block carry-out pairs - allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT); // bytes needed for tile starting coordinates - - // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - void* allocations[3] = {}; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) - { - break; - } - if (d_temp_storage == nullptr) - { - // Return if the caller is simply requesting the size of the storage allocation - break; - } - - // Construct the tile status interface - ScanTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) - { - break; - } - - // Alias the other allocations - KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[1]; // Agent carry-out pairs - CoordinateT* d_tile_coordinates = (CoordinateT*) allocations[2]; // Agent starting coordinates - - // Get search/init grid dims - int search_block_size = INIT_KERNEL_THREADS; - int search_grid_size = ::cuda::ceil_div(num_merge_tiles + 1, search_block_size); - - if (search_grid_size < sm_count) - // if (num_merge_tiles < spmv_sm_occupancy * sm_count) - { - // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords - d_tile_coordinates = nullptr; - } - else - { -// Use separate search kernel if we have enough spmv tiles to saturate the device - -// Log spmv_search_kernel configuration -#ifdef CUB_DEBUG_LOG - _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n", - search_grid_size, - search_block_size, - (long long) stream); -#endif // CUB_DEBUG_LOG - - // Invoke spmv_search_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(search_grid_size, search_block_size, 0, stream) - .doit(spmv_search_kernel, num_merge_tiles, d_tile_coordinates, spmv_params); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) - { - break; - } - - // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) - { - break; - } - } - -// Log spmv_kernel configuration -#ifdef CUB_DEBUG_LOG - _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - spmv_grid_size.x, - spmv_grid_size.y, - spmv_grid_size.z, - spmv_config.block_threads, - (long long) stream, - spmv_config.items_per_thread, - spmv_sm_occupancy); -#endif // CUB_DEBUG_LOG - - // Invoke spmv_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(spmv_grid_size, spmv_config.block_threads, 0, stream) - .doit(spmv_kernel, - spmv_params, - d_tile_coordinates, - d_tile_carry_pairs, - num_merge_tiles, - tile_state, - num_segment_fixup_tiles); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) - { - break; - } - - // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) - { - break; - } - - // Run reduce-by-key fixup if necessary - if (num_merge_tiles > 1) - { -// Log 
segment_fixup_kernel configuration -#ifdef CUB_DEBUG_LOG - _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - segment_fixup_grid_size.x, - segment_fixup_grid_size.y, - segment_fixup_grid_size.z, - segment_fixup_config.block_threads, - (long long) stream, - segment_fixup_config.items_per_thread, - segment_fixup_sm_occupancy); -#endif // CUB_DEBUG_LOG - - // Invoke segment_fixup_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream) - .doit(segment_fixup_kernel, - d_tile_carry_pairs, - spmv_params.d_vector_y, - num_merge_tiles, - num_segment_fixup_tiles, - tile_state); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) - { - break; - } - - // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) - { - break; - } - } - } while (0); - - return error; - } - - /** - * @brief Internal dispatch routine for computing a device-wide reduction - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. - * When nullptr, the required allocation size is written to - * `temp_storage_bytes` and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param SpMV spmv_params - * input parameter bundle - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. Default is stream0. - */ - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t - Dispatch(void* d_temp_storage, size_t& temp_storage_bytes, SpmvParamsT& spmv_params, cudaStream_t stream = 0) - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) - { - break; - } - - // Get kernel kernel dispatch configurations - KernelConfig spmv_config, segment_fixup_config; - InitConfigs(ptx_version, spmv_config, segment_fixup_config); - - constexpr bool has_alpha = false; - constexpr bool has_beta = false; - - if (CubDebug( - error = Dispatch( - d_temp_storage, - temp_storage_bytes, - spmv_params, - stream, - DeviceSpmv1ColKernel, - DeviceSpmvSearchKernel, - DeviceSpmvKernel, - DeviceSegmentFixupKernel, - DeviceSpmvEmptyMatrixKernel, - spmv_config, - segment_fixup_config))) - { - break; - } - - } while (0); - - return error; - } -}; - -CUB_NAMESPACE_END diff --git a/cub/test/test_device_spmv.cu b/cub/test/test_device_spmv.cu deleted file mode 100644 index 13dba77a594..00000000000 --- a/cub/test/test_device_spmv.cu +++ /dev/null @@ -1,611 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -// Ensure printing of CUDA runtime errors to console -#define CUB_STDERR - -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include "test_util.h" -#include -#include - -_CCCL_SUPPRESS_DEPRECATED_PUSH - -bool g_verbose = false; - -//============================================================================== -// Casts char types to int for numeric printing -template -T print_cast(T val) -{ - return val; -} - -int print_cast(char val) -{ - return static_cast(val); -} - -int print_cast(signed char val) -{ - return static_cast(val); -} - -int print_cast(unsigned char val) -{ - return static_cast(val); -} - -//============================================================================== -// Print a vector to out -template -void print_vector(std::ostream& out, const VectorT& vec) -{ - bool first = true; - for (const auto& val : vec) - { - if (!first) - { - out << ", "; - } - first = false; - out << print_cast(val); - } -} - -//============================================================================== -// Simple CSR matrix implementation. -// HostStorage controls whether data is stored on the host or device. -// Use the host_csr_matrix and device_csr_matrix aliases for code clarity. -template -struct csr_matrix -{ - csr_matrix(int num_rows, int num_cols) - : m_row_offsets(static_cast(num_rows + 1), 0) - , m_num_rows(num_rows) - , m_num_columns(num_cols) - {} - - // host/device conversion constructor - explicit csr_matrix(const csr_matrix& other) - : m_values(other.m_values) - , m_row_offsets(other.m_row_offsets) - , m_column_indices(other.m_column_indices) - , m_num_rows(other.m_num_rows) - , m_num_columns(other.m_num_columns) - , m_num_nonzeros(other.m_num_nonzeros) - {} - - // Note that this must append to the values array. Finish filling each row - // before adding to the next, and each row's columns must be added in order. - // Must call `finalize` once all items are added. 
- void append_value(int row, int col, ValueT value) - { - ++m_num_nonzeros; - ++m_row_offsets[row]; - m_column_indices.push_back(col); - m_values.push_back(std::move(value)); - } - - void finalize() - { - _CCCL_IF_CONSTEXPR (HostStorage) - { - thrust::exclusive_scan(thrust::host, m_row_offsets.cbegin(), m_row_offsets.cend(), m_row_offsets.begin()); - } - else - { - thrust::exclusive_scan(c2h::device_policy, m_row_offsets.cbegin(), m_row_offsets.cend(), m_row_offsets.begin()); - } - AssertEquals(m_row_offsets.back(), m_num_nonzeros); - } - - const ValueT* get_values() const - { - return thrust::raw_pointer_cast(m_values.data()); - } - - const int* get_row_offsets() const - { - return thrust::raw_pointer_cast(m_row_offsets.data()); - } - - int get_row_offset(int row) const - { - return m_row_offsets[row]; - } - - int get_row_num_nonzero(int row) const - { - return m_row_offsets[row + 1] - m_row_offsets[row]; - } - - const int* get_column_indices() const - { - return thrust::raw_pointer_cast(m_column_indices.data()); - } - - int get_num_rows() const - { - return m_num_rows; - } - - int get_num_columns() const - { - return m_num_columns; - } - - int get_num_nonzeros() const - { - return m_num_nonzeros; - } - - void print_internals(std::ostream& out) const - { - out << (HostStorage ? "host" : "device") << "_csr_matrix" - << "(" << m_num_rows << ", " << m_num_columns << ")\n" - << " - num_elems: " << (m_num_rows * m_num_columns) << "\n" - << " - num_nonzero: " << m_num_nonzeros << "\n" - << " - row_offsets:\n ["; - print_vector(out, m_row_offsets); - out << "]\n" - << " - column_indices:\n ["; - print_vector(out, m_column_indices); - out << "]\n" - << " - values:\n ["; - print_vector(out, m_values); - out << "]\n"; - } - - void print_summary(std::ostream& out) const - { - const int num_elems = m_num_rows * m_num_columns; - const float fill_ratio = - num_elems == 0 ? 0.f : (static_cast(m_num_nonzeros) / static_cast(num_elems)); - - out << m_num_rows << "x" << m_num_columns << ", " << m_num_nonzeros << "/" << num_elems << " (" << fill_ratio - << ")\n"; - } - - friend class csr_matrix; - -private: - template - using vector_t = ::cuda::std::_If, c2h::device_vector>; - - vector_t m_values; - vector_t m_row_offsets; - vector_t m_column_indices; - - int m_num_rows{0}; - int m_num_columns{0}; - int m_num_nonzeros{0}; -}; - -//============================================================================== -// Convenience aliases for host/device csr_matrix types. -template -using host_csr_matrix = csr_matrix; - -template -using device_csr_matrix = csr_matrix; - -//============================================================================== -// Compare two floats within a tolerance. -// This mimics the approach used by Thrust's ASSERT_ALMOST_EQUAL checks. -template -struct fp_almost_equal_functor -{ - __host__ __device__ bool operator()(ValueT v1, ValueT v2) const - { - constexpr double r_tol = 1e-3; - constexpr double a_tol = 1e-2; - const double limit = r_tol * (std::fabs(v1) + std::fabs(v2)) + a_tol; - return std::fabs(v1 - v2) <= limit; - } -}; - -//============================================================================== -// Compare the reference and cub output vectors. -// Use fuzzy check for floating point values. 
-template -bool compare_results( - std::true_type /* is_fp */, const c2h::host_vector& h_vec1, const c2h::device_vector& d_vec2) -{ - c2h::device_vector d_vec1(h_vec1); - auto err = thrust::mismatch( - c2h::device_policy, d_vec1.cbegin(), d_vec1.cend(), d_vec2.cbegin(), fp_almost_equal_functor{}); - if (err.first == d_vec1.cend() || err.second == d_vec2.cend()) - { - return true; - } - else - { - c2h::host_vector h_vec2(d_vec2); - const auto idx = thrust::distance(d_vec1.cbegin(), err.first); - std::cerr << "Mismatch at position " << idx << ": " << print_cast(ValueT{h_vec1[idx]}) << " vs " - << print_cast(ValueT{h_vec2[idx]}) << std::endl; - return false; - } -}; - -template -bool compare_results( - std::false_type /* is_fp */, const c2h::host_vector& h_vec1, const c2h::device_vector& d_vec2) -{ - c2h::device_vector d_vec1(h_vec1); - auto err = thrust::mismatch(c2h::device_policy, d_vec1.cbegin(), d_vec1.cend(), d_vec2.cbegin()); - if (err.first == d_vec1.cend() || err.second == d_vec2.cend()) - { - return true; - } - else - { - c2h::host_vector h_vec2(d_vec2); - const auto idx = thrust::distance(d_vec1.cbegin(), err.first); - std::cerr << "Mismatch at position " << idx << ": " << print_cast(ValueT{h_vec1[idx]}) << " vs " - << print_cast(ValueT{h_vec2[idx]}) << std::endl; - return false; - } -} - -//============================================================================== -// Generate a random host_csr_matrix with the specified dimensions. -// target_fill_ratio is the target fraction of non-zero elements (may be more -// or less in the output). -template -host_csr_matrix make_random_csr_matrix(int num_rows, int num_cols, float target_fill_ratio) -{ - host_csr_matrix mat{num_rows, num_cols}; - - for (int row = 0; row < num_rows; ++row) - { - for (int col = 0; col < num_cols; ++col) - { - const bool is_non_zero = RandomValue(1.f) < target_fill_ratio; - if (!is_non_zero) - { - continue; - } - - if (std::is_floating_point::value) - { - // Keep fp numbers somewhat small, from -50 -> 50; otherwise we run - // into issues with nans/infs - ValueT value = (RandomValue(static_cast(100)) - static_cast(50)); - mat.append_value(row, col, value); - } - else - { - ValueT value{}; - InitValue(RANDOM, value); - mat.append_value(row, col, value); - } - } - } - - mat.finalize(); - - const int num_elements = num_rows * num_cols; - const float actual_fill_ratio = static_cast(mat.get_num_nonzeros()) / static_cast(num_elements); - - if (g_verbose) - { - printf( - "Created host_csr_matrix<%s>(%d, %d)\n" - " - NumElements: %d\n" - " - NumNonZero: %d\n" - " - Target fill: %0.2f%%\n" - " - Actual fill: %0.2f%%\n", - typeid(ValueT).name(), - num_rows, - num_cols, - num_elements, - mat.get_num_nonzeros(), - target_fill_ratio, - actual_fill_ratio); - } - - return mat; -} - -//============================================================================== -// Fill a vector with random values. 
-template -c2h::host_vector make_random_vector(int len) -{ - c2h::host_vector vec(len); - for (auto& val : vec) - { - if (std::is_floating_point::value) - { // Keep fp numbers somewhat small; otherwise we run into issues with - // nans/infs - val = RandomValue(static_cast(100)) - static_cast(50); - } - else - { - InitValue(RANDOM, val); - } - } - return vec; -} - -//============================================================================== -// Serial y = Ax computation -template -void compute_reference_solution( - const host_csr_matrix& a, const c2h::host_vector& x, c2h::host_vector& y) -{ - if (a.get_num_rows() == 0 || a.get_num_columns() == 0) - { - return; - } - - for (int row = 0; row < a.get_num_rows(); ++row) - { - const int row_offset = a.get_row_offset(row); - const int row_length = a.get_row_num_nonzero(row); - const int* cols = a.get_column_indices() + row_offset; - const int* cols_end = cols + row_length; - const ValueT* values = a.get_values() + row_offset; - - ValueT accum{}; - while (cols < cols_end) - { - accum += (*values++) * x[*cols++]; - } - y[row] = accum; - } -} - -//============================================================================== -// cub::DeviceSpmv::CsrMV y = Ax computation -template -void compute_cub_solution( - const device_csr_matrix& a, const c2h::device_vector& x, c2h::device_vector& y) -{ - c2h::device_vector temp_storage; - std::size_t temp_storage_bytes{}; - auto err = cub::DeviceSpmv::CsrMV( - nullptr, - temp_storage_bytes, - a.get_values(), - a.get_row_offsets(), - a.get_column_indices(), - thrust::raw_pointer_cast(x.data()), - thrust::raw_pointer_cast(y.data()), - a.get_num_rows(), - a.get_num_columns(), - a.get_num_nonzeros()); - CubDebugExit(err); - - temp_storage.resize(temp_storage_bytes); - - err = cub::DeviceSpmv::CsrMV( - thrust::raw_pointer_cast(temp_storage.data()), - temp_storage_bytes, - a.get_values(), - a.get_row_offsets(), - a.get_column_indices(), - thrust::raw_pointer_cast(x.data()), - thrust::raw_pointer_cast(y.data()), - a.get_num_rows(), - a.get_num_columns(), - a.get_num_nonzeros()); - CubDebugExit(err); -} - -//============================================================================== -// Compute y = Ax twice, one reference and one cub::DeviceSpmv, and compare the -// results. 
-template -void test_spmv(const host_csr_matrix& h_a, const c2h::host_vector& h_x) -{ - if (g_verbose) - { - std::cout << "Testing cub::DeviceSpmv on inputs:\n"; - h_a.print_internals(std::cout); - std::cout << "x vector:\n ["; - print_vector(std::cout, h_x); - std::cout << "]" << std::endl; - } - else - { - h_a.print_summary(std::cout); - } - - const device_csr_matrix d_a(h_a); - const c2h::device_vector d_x(h_x); - - c2h::host_vector h_y(h_a.get_num_rows()); - c2h::device_vector d_y(d_a.get_num_rows()); - - compute_reference_solution(h_a, h_x, h_y); - compute_cub_solution(d_a, d_x, d_y); - - if (g_verbose) - { - std::cout << "reference output:\n ["; - print_vector(std::cout, h_y); - std::cout << "]\n"; - c2h::host_vector tmp_y(d_y); - std::cout << "cub::DeviceSpmv output:\n ["; - print_vector(std::cout, tmp_y); - std::cout << "]" << std::endl; - } - - constexpr auto is_fp = std::is_floating_point{}; - AssertTrue(compare_results(is_fp, h_y, d_y)); -} - -//============================================================================== -// Test example from cub::DeviceSpmv documentation -template -void test_doc_example() -{ - std::cout << "\n\ntest_doc_example<" << typeid(ValueT).name() << ">()" << std::endl; - - host_csr_matrix h_a(9, 9); - h_a.append_value(0, 1, ValueT{1}); - h_a.append_value(0, 3, ValueT{1}); - h_a.append_value(1, 0, ValueT{1}); - h_a.append_value(1, 2, ValueT{1}); - h_a.append_value(1, 4, ValueT{1}); - h_a.append_value(2, 1, ValueT{1}); - h_a.append_value(2, 5, ValueT{1}); - h_a.append_value(3, 0, ValueT{1}); - h_a.append_value(3, 4, ValueT{1}); - h_a.append_value(3, 6, ValueT{1}); - h_a.append_value(4, 1, ValueT{1}); - h_a.append_value(4, 3, ValueT{1}); - h_a.append_value(4, 5, ValueT{1}); - h_a.append_value(4, 7, ValueT{1}); - h_a.append_value(5, 2, ValueT{1}); - h_a.append_value(5, 4, ValueT{1}); - h_a.append_value(5, 8, ValueT{1}); - h_a.append_value(6, 3, ValueT{1}); - h_a.append_value(6, 7, ValueT{1}); - h_a.append_value(7, 4, ValueT{1}); - h_a.append_value(7, 6, ValueT{1}); - h_a.append_value(7, 8, ValueT{1}); - h_a.append_value(8, 5, ValueT{1}); - h_a.append_value(8, 7, ValueT{1}); - h_a.finalize(); - - c2h::host_vector h_x(9, ValueT{1}); - - test_spmv(h_a, h_x); -} - -//============================================================================== -// Generate and test a random SpMV operation with the given parameters. -template -void test_random(int rows, int cols, float target_fill_ratio) -{ - std::cout << "\n\ntest_random<" << typeid(ValueT).name() << ">(" << rows << ", " << cols << ", " << target_fill_ratio - << ")" << std::endl; - - host_csr_matrix h_a = make_random_csr_matrix(rows, cols, target_fill_ratio); - c2h::host_vector h_x = make_random_vector(cols); - - test_spmv(h_a, h_x); -} - -//============================================================================== -// Dispatch many random SpMV tests over a variety of parameters. 
-template -void test_random() -{ - test_random(0, 0, 1.f); - test_random(0, 1, 1.f); - test_random(1, 0, 1.f); - - constexpr int dim_min = 1; - constexpr int dim_max = 10000; - - constexpr int max_num_elems = 100000; - - constexpr float ratio_min = 0.f; - constexpr float ratio_max = 1.1f; // a lil over to account for fp errors - constexpr float ratio_step = 0.3334f; - - for (int rows = dim_min; rows < dim_max; rows <<= 1) - { - for (int cols = dim_min; cols < dim_max; cols <<= 1) - { - if (rows * cols >= max_num_elems) - { - continue; - } - - for (float ratio = ratio_min; ratio < ratio_max; ratio += ratio_step) - { - test_random(rows, cols, ratio); - // Test nearby non-power-of-two dims: - test_random(rows + 97, cols + 83, ratio); - } - } - } -} - -//============================================================================== -// Dispatch many SpMV tests for a given ValueT. -template -void test_type() -{ - test_doc_example(); - test_random(); -} - -//============================================================================== -// Dispatch many SpMV tests over a variety of types. -void test_types() -{ - test_type(); - test_type(); - test_type(); - test_type(); - test_type(); -} - -int main(int argc, char** argv) -{ - // Initialize command line - CommandLineArgs args(argc, argv); - g_verbose = args.CheckCmdLineFlag("v"); - - // Print usage - if (args.CheckCmdLineFlag("help")) - { - printf("%s " - "[--device=] " - "[--v] verbose" - "\n", - argv[0]); - exit(0); - } - - CubDebugExit(args.DeviceInit()); - - test_types(); -} - -_CCCL_SUPPRESS_DEPRECATED_POP From b6dd111f7ab213d14c41c3cd96c7e819683541bf Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Thu, 30 Jan 2025 11:01:23 -0800 Subject: [PATCH 27/33] work around erroneous "undefined in device code" error in `basic_any` (#3614) --- .../__utility/basic_any/virtcall.cuh | 29 ++++++++++--------- .../__utility/basic_any/virtual_functions.cuh | 4 +-- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/cudax/include/cuda/experimental/__utility/basic_any/virtcall.cuh b/cudax/include/cuda/experimental/__utility/basic_any/virtcall.cuh index a51b36d29c9..e090e4b023c 100644 --- a/cudax/include/cuda/experimental/__utility/basic_any/virtcall.cuh +++ b/cudax/include/cuda/experimental/__utility/basic_any/virtcall.cuh @@ -22,6 +22,7 @@ #endif // no system header #include +#include #include #include @@ -60,19 +61,13 @@ namespace cuda::experimental //! except for the virtuals map, which substitutes the correct member function //! pointer for the user so they don't have to think about it. template -struct __virtuals_map_pair +struct __virtuals_map_element { // map ifoo<>::meow to itself - _CCCL_NODISCARD _CUDAX_TRIVIAL_HOST_API constexpr auto operator()(__ctag<_Mbr>) const noexcept - { - return _Mbr; - } + auto operator()(__ctag<_Mbr>) const -> __virtual_fn<_Mbr>; // map ifoo<_Super>::meow to ifoo<>::meow - _CCCL_NODISCARD _CUDAX_TRIVIAL_HOST_API constexpr auto operator()(__ctag<_BoundMbr>) const noexcept - { - return _Mbr; - } + auto operator()(__ctag<_BoundMbr>) const -> __virtual_fn<_Mbr>; }; template @@ -80,15 +75,23 @@ struct __virtuals_map; template struct __virtuals_map, overrides_for<_BoundInterface, _BoundMbrs...>> - : __virtuals_map_pair<_Mbrs, _BoundMbrs>... + : __virtuals_map_element<_Mbrs, _BoundMbrs>... 
{ - using __virtuals_map_pair<_Mbrs, _BoundMbrs>::operator()...; + using __virtuals_map_element<_Mbrs, _BoundMbrs>::operator()...; }; template using __virtuals_map_for _CCCL_NODEBUG_ALIAS = __virtuals_map<__overrides_for<_Interface>, __overrides_for<__rebind_interface<_Interface, _Super>>>; +template +extern _CUDA_VSTD::__call_result_t<__virtuals_map_for<_Interface, _Super>, __ctag<_Mbr>> __virtual_fn_for_v; + +// This alias indirects through the above variable template to cache the result +// of the virtuals map lookup. +template +using __virtual_fn_for _CCCL_NODEBUG_ALIAS = decltype(__virtual_fn_for_v<_Mbr, _Interface, _Super>); + //! //! virtcall //! @@ -109,8 +112,8 @@ _CUDAX_HOST_API auto __virtcall(_Self* __self, _Args&&... __args) // auto* __vptr = __basic_any_access::__get_vptr(*__self)->__query_interface(_Interface()); auto* __obj = __basic_any_access::__get_optr(*__self); // map the member function pointer to the correct one if necessary - constexpr auto _Mbr2 = __virtuals_map_for<_Interface, _Super>{}(__ctag<_Mbr>()); - return __vptr->__virtual_fn<_Mbr2>::__fn_(__obj, static_cast<_Args&&>(__args)...); + using __virtual_fn_t = __virtual_fn_for<_Mbr, _Interface, _Super>; + return __vptr->__virtual_fn_t::__fn_(__obj, static_cast<_Args&&>(__args)...); } _CCCL_TEMPLATE(auto _Mbr, template class _Interface, class _Super, class... _Args) diff --git a/cudax/include/cuda/experimental/__utility/basic_any/virtual_functions.cuh b/cudax/include/cuda/experimental/__utility/basic_any/virtual_functions.cuh index 32d7a77ebba..f31d396b608 100644 --- a/cudax/include/cuda/experimental/__utility/basic_any/virtual_functions.cuh +++ b/cudax/include/cuda/experimental/__utility/basic_any/virtual_functions.cuh @@ -64,8 +64,8 @@ _CUDAX_TRIVIAL_API auto __c_style_cast(_Src* __ptr) noexcept -> _DstPtr } template -_CCCL_NODISCARD _CUDAX_API auto __override_fn_([[maybe_unused]] _CUDA_VSTD::__maybe_const<_IsConst, void>* __pv, - [[maybe_unused]] _Args... __args) noexcept(_IsNothrow) -> _Ret +_CCCL_NODISCARD _CUDAX_HOST_API auto __override_fn_([[maybe_unused]] _CUDA_VSTD::__maybe_const<_IsConst, void>* __pv, + [[maybe_unused]] _Args... __args) noexcept(_IsNothrow) -> _Ret { using __value_type _CCCL_NODEBUG_ALIAS = _CUDA_VSTD::__maybe_const<_IsConst, _Tp>; From 73db01e1bb352a12ee46930644783bede3d6e3ed Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Thu, 30 Jan 2025 11:04:09 -0800 Subject: [PATCH 28/33] Deprecate `AgentSegmentFixupPolicy` (#3593) --- cub/cub/agent/agent_segment_fixup.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cub/cub/agent/agent_segment_fixup.cuh b/cub/cub/agent/agent_segment_fixup.cuh index 717b9b115e9..515b34d7c72 100644 --- a/cub/cub/agent/agent_segment_fixup.cuh +++ b/cub/cub/agent/agent_segment_fixup.cuh @@ -84,7 +84,8 @@ template -struct AgentSegmentFixupPolicy +struct CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " + "interface will be removed.") AgentSegmentFixupPolicy { enum { From 0ecae0387712c0d7fb615b3e42b5c3811965d2f3 Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Thu, 30 Jan 2025 11:46:28 -0800 Subject: [PATCH 29/33] Fix deadlocks by enabling eager module loading in libcudacxx tests. (#3585) * Try using eager loading in lit to fix barrier deadlock * Use override to limit testing * Revert "Use override to limit testing" This reverts commit 86edec483cbc74050db1d9364c494934d960a8eb. 
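The config.py change that follows exports CUDA_MODULE_LOADING=EAGER into the environment of every libcudacxx test process, so kernels are loaded up front at context creation rather than at first launch, avoiding the barrier deadlock the commit message describes. A minimal host-side sketch of the same setting, assuming a standalone test driver rather than the lit harness (the setenv call and the cudaFree warm-up are illustrative, not taken from this patch):

// Sketch only: select the module-loading policy before the CUDA runtime initializes.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

int main()
{
  // The variable is read at context creation, so export it before the first CUDA call.
  setenv("CUDA_MODULE_LOADING", "EAGER", /*overwrite=*/1); // POSIX; use _putenv_s on Windows
  cudaFree(nullptr); // forces runtime/context initialization under the eager policy
  std::printf("initialized with CUDA_MODULE_LOADING=%s\n", std::getenv("CUDA_MODULE_LOADING"));
  return 0;
}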
--- libcudacxx/test/utils/libcudacxx/test/config.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/libcudacxx/test/utils/libcudacxx/test/config.py b/libcudacxx/test/utils/libcudacxx/test/config.py index af90b9fcbec..dbf7a84b73d 100644 --- a/libcudacxx/test/utils/libcudacxx/test/config.py +++ b/libcudacxx/test/utils/libcudacxx/test/config.py @@ -78,6 +78,7 @@ def __init__(self, lit_config, config): self.link_shared = self.get_lit_bool("enable_shared", default=True) self.debug_build = self.get_lit_bool("debug_build", default=False) self.exec_env = dict(os.environ) + self.exec_env["CUDA_MODULE_LOADING"] = "EAGER" self.use_target = False self.use_system_cxx_lib = False self.use_clang_verify = False @@ -201,7 +202,8 @@ def get_modules_enabled(self): def make_static_lib_name(self, name): """Return the full filename for the specified library name""" if self.is_windows: - assert name == "c++" # Only allow libc++ to use this function for now. + # Only allow libc++ to use this function for now. + assert name == "c++" return "lib" + name + ".lib" else: return "lib" + name + ".a" @@ -308,7 +310,7 @@ def configure_cxx(self): cxx is not None and os.path.basename(cxx) == "clang-cl.exe" ) - ## Build CXXCompiler manually for NVRTCC + # Build CXXCompiler manually for NVRTCC if nvrtc is True: cxx_type = "nvrtcc" self.cxx = CXXCompiler( @@ -751,7 +753,7 @@ def configure_compile_flags(self): if compute_archs == "native": compute_archs = self.get_compute_capabilities() - compute_archs = set(sorted(re.split("\s|;|,", compute_archs))) + compute_archs = set(sorted(re.split("\\s|;|,", compute_archs))) for s in compute_archs: # Split arch and mode i.e. 80-virtual -> 80, virtual arch, *mode = re.split("-", s) From 51a890a86fdae2549c9e99f0b062ae954581aa0d Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 21:26:22 +0100 Subject: [PATCH 30/33] Add b200 tunings for histogram (#3616) Co-authored-by: Giannis Gonidelis --- .../device/dispatch/dispatch_histogram.cuh | 42 +++- .../dispatch/tuning/tuning_histogram.cuh | 186 +++++++++++++++++- 2 files changed, 216 insertions(+), 12 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 2c2d0a2a9ca..43944dfc0b5 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -36,6 +36,8 @@ #include +#include + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -554,8 +556,7 @@ template , CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS>> + typename PolicyHub = void> // if user passes a custom Policy this should not be void struct DispatchHistogram { static_assert(NUM_CHANNELS <= 4, "Histograms only support up to 4 channels"); @@ -920,8 +921,14 @@ public: cudaStream_t stream, Int2Type /*is_byte_sample*/) { - using MaxPolicyT = typename PolicyHub::MaxPolicy; - cudaError error = cudaSuccess; + // Should we call DispatchHistogram<....., PolicyHub=void> in DeviceHistogram? 
+ static constexpr bool isEven = 0; + using fallback_policy_hub = detail::histogram:: + policy_hub, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, isEven>; + + using MaxPolicyT = + typename cuda::std::_If::value, fallback_policy_hub, PolicyHub>::MaxPolicy; + cudaError error = cudaSuccess; do { @@ -1091,8 +1098,13 @@ public: cudaStream_t stream, Int2Type /*is_byte_sample*/) { - using MaxPolicyT = typename PolicyHub::MaxPolicy; - cudaError error = cudaSuccess; + static constexpr bool isEven = 0; + using fallback_policy_hub = detail::histogram:: + policy_hub, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, isEven>; + + using MaxPolicyT = + typename cuda::std::_If::value, fallback_policy_hub, PolicyHub>::MaxPolicy; + cudaError error = cudaSuccess; do { @@ -1226,8 +1238,13 @@ public: cudaStream_t stream, Int2Type /*is_byte_sample*/) { - using MaxPolicyT = typename PolicyHub::MaxPolicy; - cudaError error = cudaSuccess; + static constexpr bool isEven = 1; + using fallback_policy_hub = detail::histogram:: + policy_hub, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, isEven>; + + using MaxPolicyT = + typename cuda::std::_If::value, fallback_policy_hub, PolicyHub>::MaxPolicy; + cudaError error = cudaSuccess; do { @@ -1412,8 +1429,13 @@ public: cudaStream_t stream, Int2Type /*is_byte_sample*/) { - using MaxPolicyT = typename PolicyHub::MaxPolicy; - cudaError error = cudaSuccess; + static constexpr bool isEven = 1; + using fallback_policy_hub = detail::histogram:: + policy_hub, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, isEven>; + + using MaxPolicyT = + typename cuda::std::_If::value, fallback_policy_hub, PolicyHub>::MaxPolicy; + cudaError error = cudaSuccess; do { diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index bd19489971e..3ae3f7fc58a 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -60,6 +60,8 @@ enum class sample_size { _1, _2, + _4, + _8, unknown }; @@ -125,7 +127,164 @@ struct sm90_tuning +template (), + sample_size SampleSize = classify_sample_size()> +struct sm100_tuning; + +// even +template +struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_1> +{ + // ipt_12.tpb_928.rle_0.ws_0.mem_1.ld_2.laid_0.vec_2 1.033332 0.940517 1.031835 1.195876 + static constexpr int items = 12; + static constexpr int threads = 928; + static constexpr bool rle_compress = false; + static constexpr bool work_stealing = false; + static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; + static constexpr CacheLoadModifier load_modifier = LOAD_CA; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + static constexpr int tune_vec_size = 1 << 2; +}; + +// same as base +template +struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2> + : sm90_tuning +{}; + +// same as base +template +struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4> + : sm90_tuning +{}; + +// same as base +template +struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8> + : sm90_tuning +{}; + +// range +template +struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_1> +{ + // ipt_12.tpb_448.rle_0.ws_0.mem_1.ld_1.laid_0.vec_2 1.078987 0.985542 1.085118 1.175637 + static constexpr int items = 12; + static constexpr int threads = 448; + static constexpr bool rle_compress = 
false; + static constexpr bool work_stealing = false; + static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; + static constexpr CacheLoadModifier load_modifier = LOAD_LDG; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + static constexpr int tune_vec_size = 1 << 2; +}; + +// same as base +template +struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2> + : sm90_tuning +{}; + +template +struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4> +{ + // ipt_9.tpb_1024.rle_1.ws_0.mem_1.ld_0.laid_1.vec_0 1.358537 1.001009 1.373329 2.614104 + static constexpr int items = 9; + static constexpr int threads = 1024; + static constexpr bool rle_compress = true; + static constexpr bool work_stealing = false; + static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; + static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + static constexpr int tune_vec_size = 1 << 0; +}; + +template +struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8> +{ + // ipt_7.tpb_544.rle_1.ws_0.mem_1.ld_1.laid_0.vec_0 1.105331 0.934888 1.108557 1.391657 + static constexpr int items = 7; + static constexpr int threads = 544; + static constexpr bool rle_compress = true; + static constexpr bool work_stealing = false; + static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; + static constexpr CacheLoadModifier load_modifier = LOAD_LDG; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + static constexpr int tune_vec_size = 1 << 0; +}; + +// multi.even +template +struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_1> +{ + // ipt_9.tpb_1024.rle_0.ws_0.mem_1.ld_1.laid_1.vec_0 1.629591 0.997416 1.570900 2.772504 + static constexpr int items = 9; + static constexpr int threads = 1024; + static constexpr bool rle_compress = false; + static constexpr bool work_stealing = false; + static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; + static constexpr CacheLoadModifier load_modifier = LOAD_LDG; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + static constexpr int tune_vec_size = 1 << 0; +}; + +// same as base +template +struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2> + : sm90_tuning +{}; + +// same as base +template +struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> + : sm90_tuning +{}; + +// same as base +template +struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> + : sm90_tuning +{}; + +// multi.range +template +struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_1> +{ + // ipt_7.tpb_160.rle_0.ws_0.mem_1.ld_1.laid_1.vec_1 1.210837 0.99556 1.189049 1.939584 + static constexpr int items = 7; + static constexpr int threads = 160; + static constexpr bool rle_compress = false; + static constexpr bool work_stealing = false; + static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; + static constexpr CacheLoadModifier load_modifier = LOAD_LDG; + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + static constexpr int tune_vec_size = 1 << 1; +}; + +// same as base +template +struct sm100_tuning<0, SampleT, 4, 3, 
counter_size::_4, primitive_sample::yes, sample_size::_2> + : sm90_tuning +{}; + +// same as base +template +struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> + : sm90_tuning +{}; + +// same as base +template +struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> + : sm90_tuning +{}; + +template struct policy_hub { // TODO(bgruber): move inside t_scale in C++14 @@ -166,7 +325,30 @@ struct policy_hub sm90_tuning()>>(0)); }; - using MaxPolicy = Policy900; + struct Policy1000 : ChainedPolicy<1000, Policy1000, Policy900> + { + // Use values from tuning if a specialization exists, otherwise pick Policy900 + template + static auto select_agent_policy(int) + -> AgentHistogramPolicy; + + template + static auto select_agent_policy(long) -> typename Policy900::AgentHistogramPolicyT; + + using AgentHistogramPolicyT = + decltype(select_agent_policy< + sm100_tuning()>>( + 0)); + }; + + using MaxPolicy = Policy1000; }; } // namespace histogram } // namespace detail From 0ce59c7f3eb8b6d92fee4c88065dacebe2864b87 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Thu, 30 Jan 2025 12:34:04 -0800 Subject: [PATCH 31/33] make `uninitialized[_async]_buffer`'s range accessors const-correct (#3615) --- .../uninitialized_async_buffer.cuh | 36 ++++++++++++++----- .../__container/uninitialized_buffer.cuh | 36 ++++++++++++++----- .../containers/uninitialized_async_buffer.cu | 6 ++++ cudax/test/containers/uninitialized_buffer.cu | 6 ++++ 4 files changed, 68 insertions(+), 16 deletions(-) diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index 3f55084dc63..e66b7bf2ac3 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -108,7 +108,7 @@ private: size_t __space = __get_allocation_size(__count_); void* __ptr = __buf_; return _CUDA_VSTD::launder( - reinterpret_cast<_Tp*>(_CUDA_VSTD::align(__alignment, __count_ * sizeof(_Tp), __ptr, __space))); + static_cast<_Tp*>(_CUDA_VSTD::align(__alignment, __count_ * sizeof(_Tp), __ptr, __space))); } //! @brief Causes the buffer to be treated as a span when passed to cudax::launch. @@ -136,10 +136,12 @@ private: } public: - using value_type = _Tp; - using reference = _Tp&; - using pointer = _Tp*; - using size_type = size_t; + using value_type = _Tp; + using reference = _Tp&; + using const_reference = const _Tp&; + using pointer = _Tp*; + using const_pointer = const _Tp*; + using size_type = size_t; //! @brief Constructs an \c uninitialized_async_buffer, allocating sufficient storage for \p __count elements through //! \p __mr @@ -215,20 +217,38 @@ public: } //! @brief Returns an aligned pointer to the first element in the buffer - _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI constexpr pointer begin() const noexcept + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI constexpr pointer begin() noexcept + { + return __get_data(); + } + + //! @overload + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI constexpr const_pointer begin() const noexcept { return __get_data(); } //! @brief Returns an aligned pointer to the element following the last element of the buffer. //! This element acts as a placeholder; attempting to access it results in undefined behavior. 
- _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI constexpr pointer end() const noexcept + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI constexpr pointer end() noexcept + { + return __get_data() + __count_; + } + + //! @overload + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI constexpr const_pointer end() const noexcept { return __get_data() + __count_; } //! @brief Returns an aligned pointer to the first element in the buffer - _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI constexpr pointer data() const noexcept + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI constexpr pointer data() noexcept + { + return __get_data(); + } + + //! @overload + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI constexpr const_pointer data() const noexcept { return __get_data(); } diff --git a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh index 55168b38805..edf17e70865 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh @@ -98,7 +98,7 @@ private: size_t __space = __get_allocation_size(__count_); void* __ptr = __buf_; return _CUDA_VSTD::launder( - reinterpret_cast<_Tp*>(_CUDA_VSTD::align(__alignment, __count_ * sizeof(_Tp), __ptr, __space))); + static_cast<_Tp*>(_CUDA_VSTD::align(__alignment, __count_ * sizeof(_Tp), __ptr, __space))); } //! @brief Causes the buffer to be treated as a span when passed to cudax::launch. @@ -124,10 +124,12 @@ private: } public: - using value_type = _Tp; - using reference = _Tp&; - using pointer = _Tp*; - using size_type = size_t; + using value_type = _Tp; + using reference = _Tp&; + using const_reference = const _Tp&; + using pointer = _Tp*; + using const_pointer = const _Tp*; + using size_type = size_t; //! @brief Constructs an \c uninitialized_buffer and allocates sufficient storage for \p __count elements through //! \p __mr @@ -198,20 +200,38 @@ public: } //! @brief Returns an aligned pointer to the first element in the buffer - _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI pointer begin() const noexcept + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI pointer begin() noexcept + { + return __get_data(); + } + + //! @overload + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI const_pointer begin() const noexcept { return __get_data(); } //! @brief Returns an aligned pointer to the element following the last element of the buffer. //! This element acts as a placeholder; attempting to access it results in undefined behavior. - _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI pointer end() const noexcept + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI pointer end() noexcept + { + return __get_data() + __count_; + } + + //! @overload + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI const_pointer end() const noexcept { return __get_data() + __count_; } //! @brief Returns an aligned pointer to the first element in the buffer - _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI pointer data() const noexcept + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI pointer data() noexcept + { + return __get_data(); + } + + //! 
@overload + _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI const_pointer data() const noexcept { return __get_data(); } diff --git a/cudax/test/containers/uninitialized_async_buffer.cu b/cudax/test/containers/uninitialized_async_buffer.cu index 392f5fb2944..4e57bf7c1c7 100644 --- a/cudax/test/containers/uninitialized_async_buffer.cu +++ b/cudax/test/containers/uninitialized_async_buffer.cu @@ -137,6 +137,9 @@ TEMPLATE_TEST_CASE( SECTION("access") { uninitialized_async_buffer buf{resource, stream, 42}; + static_assert(cuda::std::is_same::value, ""); + static_assert(cuda::std::is_same::value, ""); + static_assert(cuda::std::is_same::value, ""); CUDAX_CHECK(buf.data() != nullptr); CUDAX_CHECK(buf.size() == 42); CUDAX_CHECK(buf.size_bytes() == 42 * sizeof(TestType)); @@ -145,6 +148,9 @@ TEMPLATE_TEST_CASE( CUDAX_CHECK(buf.get_stream() == stream); CUDAX_CHECK(buf.get_memory_resource() == resource); + static_assert(cuda::std::is_same::value, ""); + static_assert(cuda::std::is_same::value, ""); + static_assert(cuda::std::is_same::value, ""); CUDAX_CHECK(cuda::std::as_const(buf).data() != nullptr); CUDAX_CHECK(cuda::std::as_const(buf).size() == 42); CUDAX_CHECK(cuda::std::as_const(buf).size_bytes() == 42 * sizeof(TestType)); diff --git a/cudax/test/containers/uninitialized_buffer.cu b/cudax/test/containers/uninitialized_buffer.cu index 3e5c48c0eff..1aa8f467d8a 100644 --- a/cudax/test/containers/uninitialized_buffer.cu +++ b/cudax/test/containers/uninitialized_buffer.cu @@ -155,6 +155,9 @@ TEMPLATE_TEST_CASE( SECTION("access") { uninitialized_buffer buf{resource, 42}; + static_assert(cuda::std::is_same::value, ""); + static_assert(cuda::std::is_same::value, ""); + static_assert(cuda::std::is_same::value, ""); CUDAX_CHECK(buf.data() != nullptr); CUDAX_CHECK(buf.size() == 42); CUDAX_CHECK(buf.size_bytes() == 42 * sizeof(TestType)); @@ -162,6 +165,9 @@ TEMPLATE_TEST_CASE( CUDAX_CHECK(buf.end() == buf.begin() + buf.size()); CUDAX_CHECK(buf.get_memory_resource() == resource); + static_assert(cuda::std::is_same::value, ""); + static_assert(cuda::std::is_same::value, ""); + static_assert(cuda::std::is_same::value, ""); CUDAX_CHECK(cuda::std::as_const(buf).data() != nullptr); CUDAX_CHECK(cuda::std::as_const(buf).size() == 42); CUDAX_CHECK(cuda::std::as_const(buf).begin() == buf.data()); From 77a6a45c7ea6e46bf127cfe466ce973c5a675a87 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Thu, 30 Jan 2025 12:51:37 -0800 Subject: [PATCH 32/33] Remove `LEGACY_PTX_ARCH` (#3551) --- cub/cub/agent/agent_batch_memcpy.cuh | 2 -- cub/cub/agent/agent_histogram.cuh | 12 +++------- cub/cub/agent/agent_reduce_by_key.cuh | 2 +- cub/cub/agent/agent_rle.cuh | 2 +- cub/cub/agent/agent_scan.cuh | 2 +- cub/cub/agent/agent_scan_by_key.cuh | 2 +- cub/cub/agent/agent_select_if.cuh | 2 +- cub/cub/agent/agent_three_way_partition.cuh | 2 +- cub/cub/agent/agent_unique_by_key.cuh | 5 ++--- cub/cub/agent/single_pass_scan_operators.cuh | 1 - cub/cub/block/block_adjacent_difference.cuh | 2 +- cub/cub/block/block_discontinuity.cuh | 4 +--- cub/cub/block/block_exchange.cuh | 5 +---- cub/cub/block/block_histogram.cuh | 5 +---- cub/cub/block/block_load.cuh | 5 +---- cub/cub/block/block_radix_rank.cuh | 8 ++----- cub/cub/block/block_radix_sort.cuh | 5 +---- cub/cub/block/block_raking_layout.cuh | 4 +--- cub/cub/block/block_reduce.cuh | 5 +---- cub/cub/block/block_scan.cuh | 5 +---- cub/cub/block/block_shuffle.cuh | 4 +--- cub/cub/block/block_store.cuh | 5 +---- .../specializations/block_histogram_sort.cuh | 22 
+++---------------- .../specializations/block_reduce_raking.cuh | 9 +++----- .../block_reduce_raking_commutative_only.cuh | 9 +++----- .../block_reduce_warp_reductions.cuh | 9 +++----- .../specializations/block_scan_raking.cuh | 9 +++----- .../specializations/block_scan_warp_scans.cuh | 9 +++----- cub/cub/util_ptx.cuh | 2 +- .../warp/specializations/warp_reduce_shfl.cuh | 9 +++----- .../warp/specializations/warp_reduce_smem.cuh | 9 +++----- .../warp/specializations/warp_scan_shfl.cuh | 9 +++----- .../warp/specializations/warp_scan_smem.cuh | 9 +++----- cub/cub/warp/warp_exchange.cuh | 4 ---- cub/cub/warp/warp_load.cuh | 5 +---- cub/cub/warp/warp_merge_sort.cuh | 9 +------- cub/cub/warp/warp_reduce.cuh | 8 +++---- cub/cub/warp/warp_scan.cuh | 4 +--- cub/cub/warp/warp_store.cuh | 5 +---- cub/test/catch2_test_warp_exchange.cuh | 12 +++++----- docs/cub/developer_overview.rst | 18 ++++++--------- thrust/thrust/system/cuda/detail/core/util.h | 9 +------- thrust/thrust/system/cuda/detail/reduce.h | 2 +- .../thrust/system/cuda/detail/reduce_by_key.h | 8 +++---- .../system/cuda/detail/set_operations.h | 4 ++-- thrust/thrust/system/cuda/detail/unique.h | 6 ++--- 46 files changed, 85 insertions(+), 203 deletions(-) diff --git a/cub/cub/agent/agent_batch_memcpy.cuh b/cub/cub/agent/agent_batch_memcpy.cuh index 2b926f582fe..c2cf936bd87 100644 --- a/cub/cub/agent/agent_batch_memcpy.cuh +++ b/cub/cub/agent/agent_batch_memcpy.cuh @@ -642,14 +642,12 @@ private: TilePrefixCallbackOp, BLevBufferOffsetTileState, - 0, typename AgentMemcpySmallBuffersPolicyT::buff_delay_constructor>; using BLevBlockScanPrefixCallbackOpT = TilePrefixCallbackOp, BLevBlockOffsetTileState, - 0, typename AgentMemcpySmallBuffersPolicyT::block_delay_constructor>; //----------------------------------------------------------------------------- diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index 2e98bf76771..400d1778b11 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -172,9 +172,6 @@ namespace histogram * * @tparam OffsetT * Signed integer type for global offsets - * - * @tparam LEGACY_PTX_ARCH - * PTX compute capability (unused) */ template + typename OffsetT> struct AgentHistogram { //--------------------------------------------------------------------- @@ -930,8 +926,7 @@ template + typename OffsetT> using AgentHistogram CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and the public " "interface will be removed.") = detail::histogram::AgentHistogram< @@ -943,7 +938,6 @@ using AgentHistogram CCCL_DEPRECATED_BECAUSE("This class is considered an implem CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, - OffsetT, - LEGACY_PTX_ARCH>; + OffsetT>; CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh index a90399f4325..fffa5a88e57 100644 --- a/cub/cub/agent/agent_reduce_by_key.cuh +++ b/cub/cub/agent/agent_reduce_by_key.cuh @@ -276,7 +276,7 @@ struct AgentReduceByKey // Callback type for obtaining tile prefix during block scan using DelayConstructorT = typename AgentReduceByKeyPolicyT::detail::delay_constructor_t; using TilePrefixCallbackOpT = - TilePrefixCallbackOp; + TilePrefixCallbackOp; // Key and value exchange types using KeyExchangeT = KeyOutputT[TILE_ITEMS + 1]; diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh index 2ea0729db92..fabc0b721ae 100644 --- a/cub/cub/agent/agent_rle.cuh +++ b/cub/cub/agent/agent_rle.cuh @@ -258,7 +258,7 @@ struct AgentRle // 
Callback type for obtaining tile prefix during block scan using DelayConstructorT = typename AgentRlePolicyT::detail::delay_constructor_t; using TilePrefixCallbackOpT = - TilePrefixCallbackOp; + TilePrefixCallbackOp; // Warp exchange types using WarpExchangePairs = WarpExchange; diff --git a/cub/cub/agent/agent_scan.cuh b/cub/cub/agent/agent_scan.cuh index c3cc02b69a1..9f29615a5cd 100644 --- a/cub/cub/agent/agent_scan.cuh +++ b/cub/cub/agent/agent_scan.cuh @@ -201,7 +201,7 @@ struct AgentScan // Callback type for obtaining tile prefix during block scan using DelayConstructorT = typename AgentScanPolicyT::detail::delay_constructor_t; - using TilePrefixCallbackOpT = TilePrefixCallbackOp; + using TilePrefixCallbackOpT = TilePrefixCallbackOp; // Stateful BlockScan prefix callback type for managing a running total while // scanning consecutive tiles diff --git a/cub/cub/agent/agent_scan_by_key.cuh b/cub/cub/agent/agent_scan_by_key.cuh index 722a44ac074..161a8a5c237 100644 --- a/cub/cub/agent/agent_scan_by_key.cuh +++ b/cub/cub/agent/agent_scan_by_key.cuh @@ -179,7 +179,7 @@ struct AgentScanByKey using DelayConstructorT = typename AgentScanByKeyPolicyT::detail::delay_constructor_t; using TilePrefixCallbackT = - TilePrefixCallbackOp; + TilePrefixCallbackOp; using BlockScanT = BlockScan; diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 37e7b838adf..b1785651f12 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -274,7 +274,7 @@ struct AgentSelectIf // Callback type for obtaining tile prefix during block scan using DelayConstructorT = typename AgentSelectIfPolicyT::detail::delay_constructor_t; using TilePrefixCallbackOpT = - TilePrefixCallbackOp, MemoryOrderedTileStateT, 0, DelayConstructorT>; + TilePrefixCallbackOp, MemoryOrderedTileStateT, DelayConstructorT>; // Item exchange type using ItemExchangeT = InputT[TILE_ITEMS]; diff --git a/cub/cub/agent/agent_three_way_partition.cuh b/cub/cub/agent/agent_three_way_partition.cuh index 047861254ac..f36151f916f 100644 --- a/cub/cub/agent/agent_three_way_partition.cuh +++ b/cub/cub/agent/agent_three_way_partition.cuh @@ -207,7 +207,7 @@ struct AgentThreeWayPartition // Callback type for obtaining tile prefix during block scan using DelayConstructorT = typename PolicyT::detail::delay_constructor_t; using TilePrefixCallbackOpT = - cub::TilePrefixCallbackOp, ScanTileStateT, 0, DelayConstructorT>; + cub::TilePrefixCallbackOp, ScanTileStateT, DelayConstructorT>; // Item exchange type using ItemExchangeT = InputT[TILE_ITEMS]; diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh index a1a731f150f..52ca1d9b3a2 100644 --- a/cub/cub/agent/agent_unique_by_key.cuh +++ b/cub/cub/agent/agent_unique_by_key.cuh @@ -179,9 +179,8 @@ struct AgentUniqueByKey using BlockScanT = cub::BlockScan; // Parameterized BlockDiscontinuity type for items - using DelayConstructorT = typename AgentUniqueByKeyPolicyT::detail::delay_constructor_t; - using TilePrefixCallback = - cub::TilePrefixCallbackOp, ScanTileStateT, 0, DelayConstructorT>; + using DelayConstructorT = typename AgentUniqueByKeyPolicyT::detail::delay_constructor_t; + using TilePrefixCallback = cub::TilePrefixCallbackOp, ScanTileStateT, DelayConstructorT>; // Key exchange type using KeyExchangeT = KeyT[ITEMS_PER_TILE]; diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index bd6551b8f8d..98769aa7791 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh 
+++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -1170,7 +1170,6 @@ struct ReduceByKeyScanTileState template > struct TilePrefixCallbackOp { diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh index 38636571e80..119ca4f328e 100644 --- a/cub/cub/block/block_adjacent_difference.cuh +++ b/cub/cub/block/block_adjacent_difference.cuh @@ -122,7 +122,7 @@ CUB_NAMESPACE_BEGIN //! ``{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }``. //! //! @endrst -template +template class BlockAdjacentDifference { private: diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh index e4998f32510..c175ac96458 100644 --- a/cub/cub/block/block_discontinuity.cuh +++ b/cub/cub/block/block_discontinuity.cuh @@ -122,9 +122,7 @@ CUB_NAMESPACE_BEGIN //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! -//! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused -template +template class BlockDiscontinuity { private: diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index d1ae91c223d..402c60fe5a4 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -137,15 +137,12 @@ CUB_NAMESPACE_BEGIN //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! -//! @tparam LEGACY_PTX_ARCH -//! [optional] Unused. template + int BLOCK_DIM_Z = 1> class BlockExchange { static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; ///< The thread block size in threads diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh index 41abbd588b3..5ebd5c9371d 100644 --- a/cub/cub/block/block_histogram.cuh +++ b/cub/cub/block/block_histogram.cuh @@ -179,16 +179,13 @@ enum BlockHistogramAlgorithm //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! -//! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused. template + int BLOCK_DIM_Z = 1> class BlockHistogram { private: diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index c1e9b95ac56..f4a693f4750 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -790,15 +790,12 @@ enum BlockLoadAlgorithm //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! -//! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused. template + int BLOCK_DIM_Z = 1> class BlockLoad { static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; // total threads in the block diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index ad495e1db31..6a899b1440a 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -204,8 +204,6 @@ struct warp_in_block_matcher_t //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! -//! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused. template + int BLOCK_DIM_Z = 1> class BlockRadixRank { private: @@ -560,8 +557,7 @@ template + int BLOCK_DIM_Z = 1> class BlockRadixRankMatch { private: diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 080053348d7..55dd8747ee4 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -238,8 +238,6 @@ CUB_NAMESPACE_BEGIN //! @tparam BLOCK_DIM_Z //! 
**[optional]** The thread block length in threads along the Z dimension (default: 1) //! -//! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused template + int BLOCK_DIM_Z = 1> class BlockRadixSort { private: diff --git a/cub/cub/block/block_raking_layout.cuh b/cub/cub/block/block_raking_layout.cuh index 4d675b626b8..8f7f8b138c0 100644 --- a/cub/cub/block/block_raking_layout.cuh +++ b/cub/cub/block/block_raking_layout.cuh @@ -68,9 +68,7 @@ CUB_NAMESPACE_BEGIN //! @tparam BLOCK_THREADS //! The thread block size in threads. //! -//! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused. -template +template struct BlockRakingLayout { //--------------------------------------------------------------------- diff --git a/cub/cub/block/block_reduce.cuh b/cub/cub/block/block_reduce.cuh index 6cf578963fc..356134d3b40 100644 --- a/cub/cub/block/block_reduce.cuh +++ b/cub/cub/block/block_reduce.cuh @@ -232,14 +232,11 @@ enum BlockReduceAlgorithm //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! -//! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused. template + int BLOCK_DIM_Z = 1> class BlockReduce { private: diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index c25bd2d258d..de019116956 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -221,14 +221,11 @@ enum BlockScanAlgorithm //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! -//! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused. template + int BLOCK_DIM_Z = 1> class BlockScan { private: diff --git a/cub/cub/block/block_shuffle.cuh b/cub/cub/block/block_shuffle.cuh index 93d8715c63b..0cb42eba3a0 100644 --- a/cub/cub/block/block_shuffle.cuh +++ b/cub/cub/block/block_shuffle.cuh @@ -73,9 +73,7 @@ CUB_NAMESPACE_BEGIN //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! -//! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused -template +template class BlockShuffle { private: diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh index e207a1d76c1..a2cd74fcd90 100644 --- a/cub/cub/block/block_store.cuh +++ b/cub/cub/block/block_store.cuh @@ -639,15 +639,12 @@ enum BlockStoreAlgorithm //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! -//! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused. 
template + int BLOCK_DIM_Z = 1> class BlockStore { private: diff --git a/cub/cub/block/specializations/block_histogram_sort.cuh b/cub/cub/block/specializations/block_histogram_sort.cuh index 127f30953b2..b5e0f7beae2 100644 --- a/cub/cub/block/specializations/block_histogram_sort.cuh +++ b/cub/cub/block/specializations/block_histogram_sort.cuh @@ -72,17 +72,8 @@ namespace detail * * @tparam BLOCK_DIM_Z * The thread block length in threads along the Z dimension - * - * @tparam LEGACY_PTX_ARCH - * The PTX compute capability for which to to specialize this collective (unused) */ -template +template struct BlockHistogramSort { /// Constants @@ -246,16 +237,9 @@ struct BlockHistogramSort }; } // namespace detail -template +template using BlockHistogramSort CCCL_DEPRECATED_BECAUSE( "This class is considered an implementation detail and the public interface will be " - "removed.") = - detail::BlockHistogramSort; + "removed.") = detail::BlockHistogramSort; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_reduce_raking.cuh b/cub/cub/block/specializations/block_reduce_raking.cuh index 90f8f12236f..a45a16f6e0d 100644 --- a/cub/cub/block/specializations/block_reduce_raking.cuh +++ b/cub/cub/block/specializations/block_reduce_raking.cuh @@ -77,11 +77,8 @@ namespace detail * * @tparam BLOCK_DIM_Z * The thread block length in threads along the Z dimension - * - * @tparam LEGACY_PTX_ARCH - * The PTX compute capability for which to to specialize this collective */ -template +template struct BlockReduceRaking { /// Constants @@ -260,9 +257,9 @@ struct BlockReduceRaking }; } // namespace detail -template +template using BlockReduceRaking CCCL_DEPRECATED_BECAUSE( "This class is considered an implementation detail and the public interface will be " - "removed.") = detail::BlockReduceRaking; + "removed.") = detail::BlockReduceRaking; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh index 7841db5f18a..28ff55b5fe0 100644 --- a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -68,11 +68,8 @@ namespace detail * * @tparam BLOCK_DIM_Z * The thread block length in threads along the Z dimension - * - * @tparam LEGACY_PTX_ARCH - * The PTX compute capability for which to to specialize this collective */ -template +template struct BlockReduceRakingCommutativeOnly { /// Constants @@ -234,9 +231,9 @@ struct BlockReduceRakingCommutativeOnly }; } // namespace detail -template +template using BlockReduceRakingCommutativeOnly CCCL_DEPRECATED_BECAUSE( "This class is considered an implementation detail and the public interface will be " - "removed.") = detail::BlockReduceRakingCommutativeOnly; + "removed.") = detail::BlockReduceRakingCommutativeOnly; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh index 2dfa526771f..b6e70248b1e 100644 --- a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -67,11 +67,8 @@ namespace detail * * @tparam BLOCK_DIM_Z * The thread block length in threads along the Z dimension - * - * @tparam LEGACY_PTX_ARCH - * The PTX compute capability for which to to specialize this collective */ -template +template struct BlockReduceWarpReductions { /// Constants @@ -259,9 +256,9 @@ 
struct BlockReduceWarpReductions }; } // namespace detail -template +template using BlockReduceWarpReductions CCCL_DEPRECATED_BECAUSE( "This class is considered an implementation detail and the public interface will be " - "removed.") = detail::BlockReduceWarpReductions; + "removed.") = detail::BlockReduceWarpReductions; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_scan_raking.cuh b/cub/cub/block/specializations/block_scan_raking.cuh index 2af4b8693fc..26d9d949226 100644 --- a/cub/cub/block/specializations/block_scan_raking.cuh +++ b/cub/cub/block/specializations/block_scan_raking.cuh @@ -73,11 +73,8 @@ namespace detail * @tparam MEMOIZE * Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the * expense of higher register pressure - * - * @tparam LEGACY_PTX_ARCH - * The PTX compute capability for which to to specialize this collective */ -template +template struct BlockScanRaking { //--------------------------------------------------------------------- @@ -797,9 +794,9 @@ struct BlockScanRaking }; } // namespace detail -template +template using BlockScanRaking CCCL_DEPRECATED_BECAUSE( "This class is considered an implementation detail and the public interface will be " - "removed.") = detail::BlockScanRaking; + "removed.") = detail::BlockScanRaking; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_scan_warp_scans.cuh b/cub/cub/block/specializations/block_scan_warp_scans.cuh index d034d2838ea..4fc74b423ce 100644 --- a/cub/cub/block/specializations/block_scan_warp_scans.cuh +++ b/cub/cub/block/specializations/block_scan_warp_scans.cuh @@ -64,11 +64,8 @@ namespace detail * * @tparam BLOCK_DIM_Z * The thread block length in threads along the Z dimension - * - * @tparam LEGACY_PTX_ARCH - * The PTX compute capability for which to to specialize this collective */ -template +template struct BlockScanWarpScans { //--------------------------------------------------------------------- @@ -539,9 +536,9 @@ struct BlockScanWarpScans } }; } // namespace detail -template +template using BlockScanWarpScans CCCL_DEPRECATED_BECAUSE( "This class is considered an implementation detail and the public interface will be " - "removed.") = detail::BlockScanWarpScans; + "removed.") = detail::BlockScanWarpScans; CUB_NAMESPACE_END diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index e6bb45c4a31..8e37c287109 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -384,7 +384,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int WarpId() * hardware warp threads). 
* @param warp_id Id of virtual warp within architectural warp */ -template +template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned int WarpMask(unsigned int warp_id) { constexpr bool is_pow_of_two = PowerOfTwo::VALUE; diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index 8c4ad78d1ad..3592e7c920a 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -92,11 +92,8 @@ struct reduce_max_exists : ::cu * * @tparam LOGICAL_WARP_THREADS * Number of threads per logical warp (must be a power-of-two) - * - * @tparam LEGACY_PTX_ARCH - * The PTX compute capability for which to to specialize this collective */ -template +template struct WarpReduceShfl { static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); @@ -739,9 +736,9 @@ struct WarpReduceShfl }; } // namespace detail -template +template using WarpReduceShfl CCCL_DEPRECATED_BECAUSE( "This class is considered an implementation detail and the public interface will be " - "removed.") = detail::WarpReduceShfl; + "removed.") = detail::WarpReduceShfl; CUB_NAMESPACE_END diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index ade195ee6cb..b4e509d6766 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -63,11 +63,8 @@ namespace detail * * @tparam LOGICAL_WARP_THREADS * Number of threads per logical warp - * - * @tparam LEGACY_PTX_ARCH - * The PTX compute capability for which to to specialize this collective */ -template +template struct WarpReduceSmem { /****************************************************************************** @@ -414,8 +411,8 @@ struct WarpReduceSmem }; } // namespace detail -template +template using WarpReduceSmem CCCL_DEPRECATED_BECAUSE( "This class is considered an implementation detail and the public interface will be " - "removed.") = detail::WarpReduceSmem; + "removed.") = detail::WarpReduceSmem; CUB_NAMESPACE_END diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh index 402b476c4e4..4b3b115b266 100644 --- a/cub/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh @@ -62,11 +62,8 @@ namespace detail * * @tparam LOGICAL_WARP_THREADS * Number of threads per logical warp (must be a power-of-two) - * - * @tparam LEGACY_PTX_ARCH - * The PTX compute capability for which to to specialize this collective */ -template +template struct WarpScanShfl { //--------------------------------------------------------------------- @@ -677,9 +674,9 @@ struct WarpScanShfl }; } // namespace detail -template +template using WarpScanShfl CCCL_DEPRECATED_BECAUSE( "This class is considered an implementation detail and the public interface will be " - "removed.") = detail::WarpScanShfl; + "removed.") = detail::WarpScanShfl; CUB_NAMESPACE_END diff --git a/cub/cub/warp/specializations/warp_scan_smem.cuh b/cub/cub/warp/specializations/warp_scan_smem.cuh index 090f0f96cb5..e6d18fb561f 100644 --- a/cub/cub/warp/specializations/warp_scan_smem.cuh +++ b/cub/cub/warp/specializations/warp_scan_smem.cuh @@ -63,11 +63,8 @@ namespace detail * * @tparam LOGICAL_WARP_THREADS * Number of threads per logical warp - * - * @tparam LEGACY_PTX_ARCH - * The PTX compute capability for which to to specialize this collective */ -template +template struct WarpScanSmem { 
/****************************************************************************** @@ -435,9 +432,9 @@ struct WarpScanSmem }; } // namespace detail -template +template using WarpScanSmem CCCL_DEPRECATED_BECAUSE( "This class is considered an implementation detail and the public interface will be " - "removed.") = detail::WarpScanSmem; + "removed.") = detail::WarpScanSmem; CUB_NAMESPACE_END diff --git a/cub/cub/warp/warp_exchange.cuh b/cub/cub/warp/warp_exchange.cuh index 79f422f5abe..7ce5997a446 100644 --- a/cub/cub/warp/warp_exchange.cuh +++ b/cub/cub/warp/warp_exchange.cuh @@ -83,9 +83,6 @@ using InternalWarpExchangeImpl = * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a * power of two. * - * @tparam LEGACY_PTX_ARCH - * Unused. - * * @par Overview * - It is commonplace for a warp of threads to rearrange data items between * threads. For example, the global memory accesses prefer patterns where @@ -139,7 +136,6 @@ using InternalWarpExchangeImpl = template class WarpExchange : private detail::InternalWarpExchangeImpl diff --git a/cub/cub/warp/warp_load.cuh b/cub/cub/warp/warp_load.cuh index 3f11129c35a..b945a5355b2 100644 --- a/cub/cub/warp/warp_load.cuh +++ b/cub/cub/warp/warp_load.cuh @@ -216,13 +216,10 @@ enum WarpLoadAlgorithm //! targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a //! power of two. //! -//! @tparam LEGACY_PTX_ARCH -//! Unused. template + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS> class WarpLoad { static constexpr bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); diff --git a/cub/cub/warp/warp_merge_sort.cuh b/cub/cub/warp/warp_merge_sort.cuh index de3d311ae59..447dc4d00c2 100644 --- a/cub/cub/warp/warp_merge_sort.cuh +++ b/cub/cub/warp/warp_merge_sort.cuh @@ -122,14 +122,7 @@ CUB_NAMESPACE_BEGIN //! [optional] Value type (default: cub::NullType, which indicates a //! keys-only sort) //! -//! @tparam LEGACY_PTX_ARCH -//! Unused. -//! -template +template class WarpMergeSort : public BlockMergeSortStrategy[optional] Unused. -template +template class WarpReduce { private: @@ -663,8 +661,8 @@ public: }; #ifndef _CCCL_DOXYGEN_INVOKED // Do not document -template -class WarpReduce +template +class WarpReduce { private: using _TempStorage = cub::NullType; diff --git a/cub/cub/warp/warp_scan.cuh b/cub/cub/warp/warp_scan.cuh index 6eb6a35562b..e1c07c82691 100644 --- a/cub/cub/warp/warp_scan.cuh +++ b/cub/cub/warp/warp_scan.cuh @@ -156,9 +156,7 @@ CUB_NAMESPACE_BEGIN //! hardware warp threads). Default is the warp size associated with the CUDA Compute Capability //! targeted by the compiler (e.g., 32 threads for SM20). //! -//! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused. -template +template class WarpScan { private: diff --git a/cub/cub/warp/warp_store.cuh b/cub/cub/warp/warp_store.cuh index f0a9929e24f..a7ccb899607 100644 --- a/cub/cub/warp/warp_store.cuh +++ b/cub/cub/warp/warp_store.cuh @@ -223,13 +223,10 @@ enum WarpStoreAlgorithm //! targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a //! power of two. //! -//! @tparam LEGACY_PTX_ARCH -//! Unused. 
template + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS> class WarpStore { static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); diff --git a/cub/test/catch2_test_warp_exchange.cuh b/cub/test/catch2_test_warp_exchange.cuh index 4b3b13563c0..e240abc7f48 100644 --- a/cub/test/catch2_test_warp_exchange.cuh +++ b/cub/test/catch2_test_warp_exchange.cuh @@ -53,7 +53,7 @@ struct exchange_data_t inline __device__ void - scatter(cub::WarpExchange& exchange, int (&ranks)[ItemsPerThread]) + scatter(cub::WarpExchange& exchange, int (&ranks)[ItemsPerThread]) { exchange.ScatterToStriped(input, ranks); } @@ -71,7 +71,7 @@ struct exchange_data_t inline __device__ void - scatter(cub::WarpExchange& exchange, int (&ranks)[ItemsPerThread]) + scatter(cub::WarpExchange& exchange, int (&ranks)[ItemsPerThread]) { exchange.ScatterToStriped(input, output, ranks); } @@ -85,7 +85,7 @@ template __global__ void scatter_kernel(const InputT* input_data, OutputT* output_data) { - using warp_exchange_t = cub::WarpExchange; + using warp_exchange_t = cub::WarpExchange; using storage_t = typename warp_exchange_t::TempStorage; constexpr int tile_size = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS; @@ -147,7 +147,7 @@ template __global__ void kernel(const InputT* input_data, OutputT* output_data, ActionT action) { - using warp_exchange_t = cub::WarpExchange; + using warp_exchange_t = cub::WarpExchange; using storage_t = typename warp_exchange_t::TempStorage; constexpr int tile_size = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS; @@ -205,7 +205,7 @@ struct blocked_to_striped cub::WarpExchangeAlgorithm Alg> __device__ void operator()(InputT (&input)[ITEMS_PER_THREAD], OutputT (&output)[ITEMS_PER_THREAD], - cub::WarpExchange& exchange) + cub::WarpExchange& exchange) { exchange.BlockedToStriped(input, output); } @@ -221,7 +221,7 @@ struct striped_to_blocked cub::WarpExchangeAlgorithm Alg> __device__ void operator()(InputT (&input)[ITEMS_PER_THREAD], OutputT (&output)[ITEMS_PER_THREAD], - cub::WarpExchange& exchange) + cub::WarpExchange& exchange) { exchange.StripedToBlocked(input, output); } diff --git a/docs/cub/developer_overview.rst b/docs/cub/developer_overview.rst index 8b31dab6283..29f02995ac4 100644 --- a/docs/cub/developer_overview.rst +++ b/docs/cub/developer_overview.rst @@ -157,8 +157,7 @@ For example, :cpp:struct:`cub::WarpReduce` is a class template: .. code-block:: c++ template + int LOGICAL_WARP_THREADS = 32> class WarpReduce { // ... // (1) define `_TempStorage` type @@ -193,10 +192,6 @@ There is a vital difference in the behavior of warp-level algorithms that depend .. TODO: Add diagram showing non-power of two logical warps. -It's important to note that ``LEGACY_PTX_ARCH`` has been recently deprecated. -This parameter used to affect specialization selection (see below). -It was conflicting with the PTX dispatch refactoring and limited NVHPC support. - Temporary storage usage ==================================== @@ -258,13 +253,15 @@ and algorithm implementation look like: .Reduce(input, valid_items, ::cuda::std::plus<>{}); } -Due to ``LEGACY_PTX_ARCH`` issues described above, -we can't specialize on the PTX version. + + +``__CUDA_ARCH__`` cannot be used because it is conflicting with the PTX dispatch refactoring and limited NVHPC support. +Due to this limitation, we can't specialize on the PTX version. ``NV_IF_TARGET`` shall be used by specializations instead: .. 
code-block:: c++ - template + template struct WarpReduceShfl { @@ -314,8 +311,7 @@ Block-scope algorithms are provided by structures as well: int BLOCK_DIM_X, BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int LEGACY_PTX_ARCH = 0> + int BLOCK_DIM_Z = 1> class BlockReduce { public: struct TempStorage : Uninitialized<_TempStorage> {}; diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h index b3bdcf1f086..1b11f459c71 100644 --- a/thrust/thrust/system/cuda/detail/core/util.h +++ b/thrust/thrust/system/cuda/detail/core/util.h @@ -488,14 +488,7 @@ struct get_arch> template ::value_type> struct BlockLoad { - using type = - cub::BlockLoad::type::ver>; + using type = cub::BlockLoad; }; // cuda_optional diff --git a/thrust/thrust/system/cuda/detail/reduce.h b/thrust/thrust/system/cuda/detail/reduce.h index 61ec2086adf..8ef245dc082 100644 --- a/thrust/thrust/system/cuda/detail/reduce.h +++ b/thrust/thrust/system/cuda/detail/reduce.h @@ -156,7 +156,7 @@ struct ReduceAgent using Vector = typename cub::CubVector; using LoadIt = typename core::detail::LoadIterator::type; - using BlockReduce = cub::BlockReduce; + using BlockReduce = cub::BlockReduce; using VectorLoadIt = cub::CacheModifiedInputIterator; diff --git a/thrust/thrust/system/cuda/detail/reduce_by_key.h b/thrust/thrust/system/cuda/detail/reduce_by_key.h index 8c1db436085..fc6ceefa21b 100644 --- a/thrust/thrust/system/cuda/detail/reduce_by_key.h +++ b/thrust/thrust/system/cuda/detail/reduce_by_key.h @@ -169,12 +169,10 @@ struct ReduceByKeyAgent using BlockLoadKeys = typename core::detail::BlockLoad::type; using BlockLoadValues = typename core::detail::BlockLoad::type; - using BlockDiscontinuityKeys = cub::BlockDiscontinuity; + using BlockDiscontinuityKeys = cub::BlockDiscontinuity; - using TilePrefixCallback = - cub::TilePrefixCallbackOp; - using BlockScan = - cub::BlockScan; + using TilePrefixCallback = cub::TilePrefixCallbackOp; + using BlockScan = cub::BlockScan; union TempStorage { diff --git a/thrust/thrust/system/cuda/detail/set_operations.h b/thrust/thrust/system/cuda/detail/set_operations.h index b336f8e55fa..85d03ae51cb 100644 --- a/thrust/thrust/system/cuda/detail/set_operations.h +++ b/thrust/thrust/system/cuda/detail/set_operations.h @@ -300,9 +300,9 @@ struct SetOpAgent using BlockLoadValues1 = typename core::detail::BlockLoad::type; using BlockLoadValues2 = typename core::detail::BlockLoad::type; - using TilePrefixCallback = cub::TilePrefixCallbackOp, ScanTileState, Arch::ver>; + using TilePrefixCallback = cub::TilePrefixCallbackOp, ScanTileState>; - using BlockScan = cub::BlockScan; + using BlockScan = cub::BlockScan; // gather required temporary storage in a union // diff --git a/thrust/thrust/system/cuda/detail/unique.h b/thrust/thrust/system/cuda/detail/unique.h index 1d39b161866..bb5092ba9ef 100644 --- a/thrust/thrust/system/cuda/detail/unique.h +++ b/thrust/thrust/system/cuda/detail/unique.h @@ -153,10 +153,10 @@ struct UniqueAgent using BlockLoadItems = typename core::detail::BlockLoad::type; - using BlockDiscontinuityItems = cub::BlockDiscontinuity; + using BlockDiscontinuityItems = cub::BlockDiscontinuity; - using TilePrefixCallback = cub::TilePrefixCallbackOp, ScanTileState, Arch::ver>; - using BlockScan = cub::BlockScan; + using TilePrefixCallback = cub::TilePrefixCallbackOp, ScanTileState>; + using BlockScan = cub::BlockScan; using shared_items_t = core::detail::uninitialized_array; From 
ca007c9ea13fbd43de5df6e0d94ebe7b7e1618f3 Mon Sep 17 00:00:00 2001 From: Cliff Burdick <30670611+cliffburdick@users.noreply.github.com> Date: Thu, 30 Jan 2025 14:02:46 -0800 Subject: [PATCH 33/33] Fix typo in index.rst (#3620) --- docs/libcudacxx/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/libcudacxx/index.rst b/docs/libcudacxx/index.rst index 0ae9c84339c..4ada47e7d85 100644 --- a/docs/libcudacxx/index.rst +++ b/docs/libcudacxx/index.rst @@ -65,7 +65,7 @@ learning curve of learning CUDA. However, there are many aspects of writing high be expressed through purely Standard conforming APIs. For these cases, libcu++ also provides *extensions* of Standard Library utilities. -For example, libcu++ extends ``atomic`` and other synchornization primitives with the notion of a “thread scope” +For example, libcu++ extends ``atomic`` and other synchronization primitives with the notion of a “thread scope” that controls the strength of the memory fence. To use utilities that are extensions to Standard Library features, drop the ``std``:
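The index.rst paragraph above introduces the thread-scope extension and ends with a colon; a minimal sketch of the kind of extension it describes, assuming a simple device-wide counter (the kernel, launch shape, and managed allocation are illustrative; only the cuda::atomic scope parameter is the point):

#include <cuda/atomic>
#include <cuda_runtime.h>
#include <cstdio>
#include <new>

// Device-scoped atomic: its ordering guarantees cover threads on the same GPU,
// which can be cheaper than the system-wide default scope.
using device_counter = cuda::atomic<int, cuda::thread_scope_device>;

__global__ void count_hits(device_counter* counter)
{
  // A plain tally only needs relaxed ordering.
  counter->fetch_add(1, cuda::std::memory_order_relaxed);
}

int main()
{
  device_counter* counter = nullptr;
  cudaMallocManaged(&counter, sizeof(device_counter));
  new (counter) device_counter(0);
  count_hits<<<4, 64>>>(counter);
  cudaDeviceSynchronize();
  std::printf("hits: %d\n", counter->load());
  cudaFree(counter);
  return 0;
}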