From ced506dd40f78817047d8f93d785a2f898711d2e Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Wed, 29 Jan 2025 12:49:56 -0800 Subject: [PATCH 1/4] Remove all code paths and policies for SM37 and below (#3466) --- .../adjacent_difference/subtract_left.cu | 4 +- cub/benchmarks/bench/copy/memcpy.cu | 2 +- .../bench/histogram/histogram_common.cuh | 2 +- cub/benchmarks/bench/partition/three_way.cu | 2 +- cub/benchmarks/bench/reduce/by_key.cu | 4 +- .../bench/run_length_encode/encode.cu | 4 +- .../run_length_encode/non_trivial_runs.cu | 4 +- cub/benchmarks/bench/segmented_sort/keys.cu | 4 +- cub/benchmarks/bench/select/unique_by_key.cu | 4 +- cub/benchmarks/bench/transform/common.h | 2 +- .../device/dispatch/dispatch_spmv_orig.cuh | 50 +---------- .../tuning/tuning_adjacent_difference.cuh | 14 +--- .../dispatch/tuning/tuning_batch_memcpy.cuh | 6 +- cub/cub/device/dispatch/tuning/tuning_for.cuh | 4 +- .../dispatch/tuning/tuning_histogram.cuh | 9 +- .../device/dispatch/tuning/tuning_merge.cuh | 14 +--- .../dispatch/tuning/tuning_merge_sort.cuh | 6 +- .../dispatch/tuning/tuning_radix_sort.cuh | 83 +------------------ .../device/dispatch/tuning/tuning_reduce.cuh | 23 +---- .../dispatch/tuning/tuning_reduce_by_key.cuh | 6 +- .../tuning/tuning_run_length_encode.cuh | 16 ++-- .../device/dispatch/tuning/tuning_scan.cuh | 4 +- .../dispatch/tuning/tuning_scan_by_key.cuh | 4 +- .../dispatch/tuning/tuning_segmented_sort.cuh | 28 +------ .../dispatch/tuning/tuning_select_if.cuh | 6 +- .../tuning/tuning_three_way_partition.cuh | 6 +- .../dispatch/tuning/tuning_unique_by_key.cuh | 6 +- ...vice_run_length_encode_non_trivial_runs.cu | 4 +- cub/test/catch2_test_util_device.cu | 21 ++--- cub/test/catch2_test_vsmem.cu | 8 +- docs/cub/developer_overview.rst | 6 +- docs/repo.toml | 2 +- .../libcudacxx/cuda/test_platform.pass.cpp | 24 ------ libcudacxx/test/support/concurrent_agents.h | 4 - thrust/thrust/system/cuda/detail/core/util.h | 40 +-------- thrust/thrust/system/cuda/detail/reduce.h | 16 +--- .../thrust/system/cuda/detail/reduce_by_key.h | 49 +---------- .../system/cuda/detail/set_operations.h | 21 ----- thrust/thrust/system/cuda/detail/unique.h | 30 ------- 39 files changed, 86 insertions(+), 456 deletions(-) diff --git a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu index 6976b024d37..89e4bc485e9 100644 --- a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu +++ b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu @@ -35,7 +35,7 @@ #if !TUNE_BASE struct policy_hub_t { - struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { using AdjacentDifferencePolicy = cub::AgentAdjacentDifferencePolicy; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; #endif // !TUNE_BASE diff --git a/cub/benchmarks/bench/copy/memcpy.cu b/cub/benchmarks/bench/copy/memcpy.cu index 678091cb0c0..07162bf602a 100644 --- a/cub/benchmarks/bench/copy/memcpy.cu +++ b/cub/benchmarks/bench/copy/memcpy.cu @@ -118,7 +118,7 @@ using block_delay_constructor_t = struct policy_hub_t { - struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t> + struct policy_t : cub::ChainedPolicy<500, policy_t, policy_t> { using AgentSmallBufferPolicyT = cub::detail::AgentBatchMemcpyPolicy< TUNE_THREADS, diff --git a/cub/benchmarks/bench/histogram/histogram_common.cuh b/cub/benchmarks/bench/histogram/histogram_common.cuh index 
d6a7f9f9173..93eea3e8e02 100644 --- a/cub/benchmarks/bench/histogram/histogram_common.cuh +++ b/cub/benchmarks/bench/histogram/histogram_common.cuh @@ -62,7 +62,7 @@ constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::BLEND; template struct policy_hub_t { - struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t> + struct policy_t : cub::ChainedPolicy<500, policy_t, policy_t> { static constexpr cub::BlockLoadAlgorithm load_algorithm = (TUNE_LOAD_ALGORITHM == cub::BLOCK_LOAD_STRIPED) diff --git a/cub/benchmarks/bench/partition/three_way.cu b/cub/benchmarks/bench/partition/three_way.cu index 9b1fdb0e18d..ff53970d824 100644 --- a/cub/benchmarks/bench/partition/three_way.cu +++ b/cub/benchmarks/bench/partition/three_way.cu @@ -47,7 +47,7 @@ template struct policy_hub_t { - struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t> + struct policy_t : cub::ChainedPolicy<500, policy_t, policy_t> { using ThreeWayPartitionPolicy = // cub::AgentThreeWayPartitionPolicy + struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { using ReduceByKeyPolicyT = cub::AgentReduceByKeyPolicy; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; #endif // !TUNE_BASE diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu index 9a62b073e75..481f9a4f2ae 100644 --- a/cub/benchmarks/bench/run_length_encode/encode.cu +++ b/cub/benchmarks/bench/run_length_encode/encode.cu @@ -55,7 +55,7 @@ struct reduce_by_key_policy_hub { - struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { using ReduceByKeyPolicyT = cub::AgentReduceByKeyPolicy; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; #endif // !TUNE_BASE diff --git a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu index 112b716ca86..398711fed80 100644 --- a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu +++ b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu @@ -54,7 +54,7 @@ struct device_rle_policy_hub { - struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { using RleSweepPolicyT = cub::AgentRlePolicy; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; #endif // !TUNE_BASE diff --git a/cub/benchmarks/bench/segmented_sort/keys.cu b/cub/benchmarks/bench/segmented_sort/keys.cu index b3ecbf51656..8d793c67e44 100644 --- a/cub/benchmarks/bench/segmented_sort/keys.cu +++ b/cub/benchmarks/bench/segmented_sort/keys.cu @@ -109,7 +109,7 @@ struct device_seg_sort_policy_hub { using DominantT = KeyT; - struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { static constexpr int BLOCK_THREADS = TUNE_THREADS; static constexpr int RADIX_BITS = TUNE_RADIX_BITS; @@ -143,7 +143,7 @@ struct device_seg_sort_policy_hub TUNE_M_LOAD_MODIFIER>>; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; #endif // !TUNE_BASE diff --git a/cub/benchmarks/bench/select/unique_by_key.cu b/cub/benchmarks/bench/select/unique_by_key.cu index 7950aaeda2f..473aff6b589 100644 --- a/cub/benchmarks/bench/select/unique_by_key.cu +++ b/cub/benchmarks/bench/select/unique_by_key.cu @@ -53,7 +53,7 @@ struct policy_hub { - struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : cub::ChainedPolicy<500, 
Policy500, Policy500> { using UniqueByKeyPolicyT = cub::AgentUniqueByKeyPolicy; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; #endif // !TUNE_BASE diff --git a/cub/benchmarks/bench/transform/common.h b/cub/benchmarks/bench/transform/common.h index d8339645429..3f8ad71f590 100644 --- a/cub/benchmarks/bench/transform/common.h +++ b/cub/benchmarks/bench/transform/common.h @@ -31,7 +31,7 @@ using policy_hub_t = cub::detail::transform::policy_hub + struct max_policy : cub::ChainedPolicy<500, max_policy, max_policy> { static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__); static constexpr auto algorithm = static_cast(TUNE_ALGORITHM); diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index cd377a6d991..16353f392dc 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -383,40 +383,6 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv // Tuning policies //--------------------------------------------------------------------- - /// SM35 - struct Policy350 - { - using SpmvPolicyT = - AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 96 : 128, - (sizeof(ValueT) > 4) ? 4 : 7, - LOAD_LDG, - LOAD_CA, - LOAD_LDG, - LOAD_LDG, - LOAD_LDG, - (sizeof(ValueT) > 4) ? true : false, - BLOCK_SCAN_WARP_SCANS>; - - using SegmentFixupPolicyT = AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_WARP_SCANS>; - }; - - /// SM37 - struct Policy370 - { - using SpmvPolicyT = - AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 128 : 128, - (sizeof(ValueT) > 4) ? 9 : 14, - LOAD_LDG, - LOAD_CA, - LOAD_LDG, - LOAD_LDG, - LOAD_LDG, - false, - BLOCK_SCAN_WARP_SCANS>; - - using SegmentFixupPolicyT = AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_WARP_SCANS>; - }; - /// SM50 struct Policy500 { @@ -459,15 +425,8 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv #if (CUB_PTX_ARCH >= 600) using PtxPolicy = Policy600; -#elif (CUB_PTX_ARCH >= 500) - using PtxPolicy = Policy500; - -#elif (CUB_PTX_ARCH >= 370) - using PtxPolicy = Policy370; - #else - using PtxPolicy = Policy350; - + using PtxPolicy = Policy500; #endif // "Opaque" policies (whose parameterizations aren't reflected in the type signature) @@ -502,12 +461,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv } else if (ptx_version >= 500) { spmv_config.template Init(); segment_fixup_config.template Init(); - } else if (ptx_version >= 370) { - spmv_config.template Init(); - segment_fixup_config.template Init(); } else { - spmv_config.template Init(); - segment_fixup_config.template Init(); + spmv_config.template Init(); + segment_fixup_config.template Init(); })); } diff --git a/cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh b/cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh index 20717e1c68a..b8d0a7557bd 100644 --- a/cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh @@ -52,17 +52,7 @@ struct policy_hub { using ValueT = typename std::iterator_traits::value_type; - struct Policy300 : ChainedPolicy<300, Policy300, Policy300> - { - using AdjacentDifferencePolicy = - AgentAdjacentDifferencePolicy<128, - Nominal8BItemsToItems(7), - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE>; - }; - - struct Policy350 : ChainedPolicy<350, Policy350, 
Policy300> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { using AdjacentDifferencePolicy = AgentAdjacentDifferencePolicy<128, @@ -72,7 +62,7 @@ struct policy_hub BLOCK_STORE_WARP_TRANSPOSE>; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; } // namespace adjacent_difference } // namespace detail diff --git a/cub/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh b/cub/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh index 86fe3374d89..d0ebefe0a1f 100644 --- a/cub/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh @@ -75,8 +75,8 @@ struct policy_hub using buff_delay_constructor_t = detail::default_delay_constructor_t; using block_delay_constructor_t = detail::default_delay_constructor_t; - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> + /// SM50 + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { static constexpr bool PREFER_POW2_BITS = true; using AgentSmallBufferPolicyT = AgentBatchMemcpyPolicy< @@ -95,7 +95,7 @@ struct policy_hub }; /// SM70 - struct Policy700 : ChainedPolicy<700, Policy700, Policy350> + struct Policy700 : ChainedPolicy<700, Policy700, Policy500> { static constexpr bool PREFER_POW2_BITS = false; using AgentSmallBufferPolicyT = AgentBatchMemcpyPolicy< diff --git a/cub/cub/device/dispatch/tuning/tuning_for.cuh b/cub/cub/device/dispatch/tuning/tuning_for.cuh index 759d7e632e5..d0ec964ca90 100644 --- a/cub/cub/device/dispatch/tuning/tuning_for.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_for.cuh @@ -49,12 +49,12 @@ namespace for_each struct policy_hub_t { - struct policy_350_t : ChainedPolicy<350, policy_350_t, policy_350_t> + struct policy_500_t : ChainedPolicy<500, policy_500_t, policy_500_t> { using for_policy_t = policy_t<256, 2>; }; - using MaxPolicy = policy_350_t; + using MaxPolicy = policy_500_t; }; } // namespace for_each diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index 1a06c25cb92..bd19489971e 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -136,15 +136,8 @@ struct policy_hub return (::cuda::std::max)(nominalItemsPerThread / NumActiveChannels / v_scale, 1); } - // SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> - { - // TODO This might be worth it to separate usual histogram and the multi one - using AgentHistogramPolicyT = AgentHistogramPolicy<128, t_scale(8), BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLEND, true>; - }; - // SM50 - struct Policy500 : ChainedPolicy<500, Policy500, Policy350> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { // TODO This might be worth it to separate usual histogram and the multi one using AgentHistogramPolicyT = diff --git a/cub/cub/device/dispatch/tuning/tuning_merge.cuh b/cub/cub/device/dispatch/tuning/tuning_merge.cuh index 0d69dd45b95..2521de6e9c3 100644 --- a/cub/cub/device/dispatch/tuning/tuning_merge.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_merge.cuh @@ -53,17 +53,7 @@ struct policy_hub using tune_type = char[has_values ? 
sizeof(KeyT) + sizeof(ValueT) : sizeof(KeyT)]; - struct policy300 : ChainedPolicy<300, policy300, policy300> - { - using merge_policy = - agent_policy_t<128, - Nominal4BItemsToItems(7), - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE>; - }; - - struct policy350 : ChainedPolicy<350, policy350, policy300> + struct policy500 : ChainedPolicy<500, policy500, policy500> { using merge_policy = agent_policy_t<256, @@ -73,7 +63,7 @@ struct policy_hub BLOCK_STORE_WARP_TRANSPOSE>; }; - struct policy520 : ChainedPolicy<520, policy520, policy350> + struct policy520 : ChainedPolicy<520, policy520, policy500> { using merge_policy = agent_policy_t<512, diff --git a/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh b/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh index 94d54b08509..29e98a3898a 100644 --- a/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh @@ -51,7 +51,7 @@ struct policy_hub { using KeyT = value_t; - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { using MergeSortPolicy = AgentMergeSortPolicy<256, @@ -63,9 +63,9 @@ struct policy_hub // NVBug 3384810 #if defined(_NVHPC_CUDA) - using Policy520 = Policy350; + using Policy520 = Policy500; #else - struct Policy520 : ChainedPolicy<520, Policy520, Policy350> + struct Policy520 : ChainedPolicy<520, Policy520, Policy500> { using MergeSortPolicy = AgentMergeSortPolicy<512, diff --git a/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh b/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh index 99b8dbda413..72c464ec5ea 100644 --- a/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_radix_sort.cuh @@ -120,89 +120,8 @@ struct policy_hub // Architecture-specific tuning policies //------------------------------------------------------------------------------ - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> - { - enum - { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented - // keys/s (K40m) - ONESWEEP = false, - ONESWEEP_RADIX_BITS = 8, - }; - - // Histogram policy - using HistogramPolicy = AgentRadixSortHistogramPolicy<256, 8, 1, KeyT, ONESWEEP_RADIX_BITS>; - - // Exclusive sum policy - using ExclusiveSumPolicy = AgentRadixSortExclusiveSumPolicy<256, ONESWEEP_RADIX_BITS>; - - // Onesweep policy - using OnesweepPolicy = AgentRadixSortOnesweepPolicy< - 256, - 21, - DominantT, - 1, - RADIX_RANK_MATCH_EARLY_COUNTS_ANY, - BLOCK_SCAN_WARP_SCANS, - RADIX_SORT_STORE_DIRECT, - ONESWEEP_RADIX_BITS>; - - // Scan policy - using ScanPolicy = - AgentScanPolicy<1024, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS>; - - // Keys-only downsweep policies - using DownsweepPolicyKeys = AgentRadixSortDownsweepPolicy< - 128, - 9, - DominantT, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_LDG, - RADIX_RANK_MATCH, - BLOCK_SCAN_WARP_SCANS, - PRIMARY_RADIX_BITS>; - using AltDownsweepPolicyKeys = AgentRadixSortDownsweepPolicy< - 64, - 18, - DominantT, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - RADIX_RANK_MEMOIZE, - BLOCK_SCAN_WARP_SCANS, - PRIMARY_RADIX_BITS - 1>; - - // Key-value pairs downsweep policies - using DownsweepPolicyPairs = DownsweepPolicyKeys; - using AltDownsweepPolicyPairs = AgentRadixSortDownsweepPolicy< - 128, - 15, - DominantT, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - RADIX_RANK_MEMOIZE, - BLOCK_SCAN_WARP_SCANS, - PRIMARY_RADIX_BITS - 1>; - - // Downsweep policies - using DownsweepPolicy = ::cuda::std::_If; - - using AltDownsweepPolicy = ::cuda::std::_If; - - // Upsweep policies - using UpsweepPolicy = DownsweepPolicy; - using AltUpsweepPolicy = AltDownsweepPolicy; - - // Single-tile policy - using SingleTilePolicy = DownsweepPolicy; - - // Segmented policies - using SegmentedPolicy = DownsweepPolicy; - using AltSegmentedPolicy = AltDownsweepPolicy; - }; - /// SM50 - struct Policy500 : ChainedPolicy<500, Policy500, Policy350> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { enum { diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce.cuh index a87b6b9d6d6..d4719820752 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce.cuh @@ -79,26 +79,7 @@ CUB_RUNTIME_FUNCTION ReducePolicyWrapper MakeReducePolicyWrapper(Policy template struct policy_hub { - struct Policy300 : ChainedPolicy<300, Policy300, Policy300> - { - static constexpr int threads_per_block = 256; - static constexpr int items_per_thread = 20; - static constexpr int items_per_vec_load = 2; - - // ReducePolicy (GTX670: 154.0 @ 48M 4B items) - using ReducePolicy = - AgentReducePolicy; - - using SingleTilePolicy = ReducePolicy; - using SegmentedReducePolicy = ReducePolicy; - }; - - struct Policy350 : ChainedPolicy<350, Policy350, Policy300> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { static constexpr int threads_per_block = 256; static constexpr int items_per_thread = 20; @@ -117,7 +98,7 @@ struct policy_hub using SegmentedReducePolicy = ReducePolicy; }; - struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + struct Policy600 : ChainedPolicy<600, Policy600, Policy500> { static constexpr int threads_per_block = 256; static constexpr int items_per_thread = 16; diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index 41fbb2c49a4..a5ad19df8cc 100644 --- 
a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -633,9 +633,9 @@ struct policy_hub default_reduce_by_key_delay_constructor_t>; }; - struct Policy350 + struct Policy500 : DefaultPolicy - , ChainedPolicy<350, Policy350, Policy350> + , ChainedPolicy<500, Policy500, Policy500> {}; // Use values from tuning if a specialization exists, otherwise pick DefaultPolicy @@ -651,7 +651,7 @@ struct policy_hub template static auto select_agent_policy(long) -> typename DefaultPolicy::ReduceByKeyPolicyT; - struct Policy800 : ChainedPolicy<800, Policy800, Policy350> + struct Policy800 : ChainedPolicy<800, Policy800, Policy500> { using ReduceByKeyPolicyT = decltype(select_agent_policy()>>(0)); diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index 87631d1199e..d938209dcf2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -258,10 +258,10 @@ struct policy_hub default_reduce_by_key_delay_constructor_t>; }; - // SM35 - struct Policy350 + // SM50 + struct Policy500 : DefaultPolicy - , ChainedPolicy<350, Policy350, Policy350> + , ChainedPolicy<500, Policy500, Policy500> {}; // Use values from tuning if a specialization exists, otherwise pick the default @@ -277,7 +277,7 @@ struct policy_hub static auto select_agent_policy(long) -> typename DefaultPolicy::ReduceByKeyPolicyT; // SM80 - struct Policy800 : ChainedPolicy<800, Policy800, Policy350> + struct Policy800 : ChainedPolicy<800, Policy800, Policy500> { using ReduceByKeyPolicyT = decltype(select_agent_policy>(0)); }; @@ -451,10 +451,10 @@ struct policy_hub default_reduce_by_key_delay_constructor_t>; }; - // SM35 - struct Policy350 + // SM50 + struct Policy500 : DefaultPolicy // TODO(bgruber): I think we want `LengthT` instead of `int` - , ChainedPolicy<350, Policy350, Policy350> + , ChainedPolicy<500, Policy500, Policy500> {}; // Use values from tuning if a specialization exists, otherwise pick the default @@ -472,7 +472,7 @@ struct policy_hub typename DefaultPolicy::RleSweepPolicyT; // SM80 - struct Policy800 : ChainedPolicy<800, Policy800, Policy350> + struct Policy800 : ChainedPolicy<800, Policy800, Policy500> { using RleSweepPolicyT = decltype(select_agent_policy>(0)); }; diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index ae0d34ede32..7b076507341 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -273,13 +273,13 @@ struct policy_hub static constexpr BlockStoreAlgorithm scan_transposed_store = large_values ? 
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED : BLOCK_STORE_WARP_TRANSPOSE; - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T using ScanPolicyT = AgentScanPolicy<128, 12, AccumT, BLOCK_LOAD_DIRECT, LOAD_CA, BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, BLOCK_SCAN_RAKING>; }; - struct Policy520 : ChainedPolicy<520, Policy520, Policy350> + struct Policy520 : ChainedPolicy<520, Policy520, Policy500> { // Titan X: 32.47B items/s @ 48M 32-bit T using ScanPolicyT = diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh index cdd2468dc38..f8e29201eea 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh @@ -717,7 +717,7 @@ struct policy_hub static constexpr int max_input_bytes = static_cast((::cuda::std::max)(sizeof(key_t), sizeof(AccumT))); static constexpr int combined_input_bytes = static_cast(sizeof(key_t) + sizeof(AccumT)); - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { static constexpr int nominal_4b_items_per_thread = 6; static constexpr int items_per_thread = @@ -752,7 +752,7 @@ struct policy_hub struct Policy520 : DefaultPolicy - , ChainedPolicy<520, Policy520, Policy350> + , ChainedPolicy<520, Policy520, Policy500> {}; // Use values from tuning if a specialization exists, otherwise pick the default diff --git a/cub/cub/device/dispatch/tuning/tuning_segmented_sort.cuh b/cub/cub/device/dispatch/tuning/tuning_segmented_sort.cuh index fc442a4f982..308949d0916 100644 --- a/cub/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_segmented_sort.cuh @@ -53,33 +53,7 @@ struct policy_hub using DominantT = ::cuda::std::_If<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>; static constexpr int KEYS_ONLY = ::cuda::std::is_same::value; - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> - { - static constexpr int BLOCK_THREADS = 128; - static constexpr int RADIX_BITS = sizeof(KeyT) > 1 ? 6 : 4; - static constexpr int PARTITIONING_THRESHOLD = 300; - - using LargeSegmentPolicy = AgentRadixSortDownsweepPolicy< - BLOCK_THREADS, - 9, - DominantT, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - RADIX_RANK_MATCH, - BLOCK_SCAN_WARP_SCANS, - RADIX_BITS>; - - static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(5); - static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(5); - using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< - BLOCK_THREADS, - // Small policy - AgentSubWarpMergeSortPolicy<4, ITEMS_PER_SMALL_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>, - // Medium policy - AgentSubWarpMergeSortPolicy<32, ITEMS_PER_MEDIUM_THREAD, WARP_LOAD_DIRECT, LOAD_DEFAULT>>; - }; - - struct Policy500 : ChainedPolicy<500, Policy500, Policy350> + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { static constexpr int BLOCK_THREADS = 256; static constexpr int RADIX_BITS = sizeof(KeyT) > 1 ? 
6 : 4; diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index 792b1669fa1..10d22286068 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -566,12 +566,12 @@ struct policy_hub detail::fixed_delay_constructor_t<350, 450>>; }; - struct Policy350 + struct Policy500 : DefaultPolicy - , ChainedPolicy<350, Policy350, Policy350> + , ChainedPolicy<500, Policy500, Policy500> {}; - struct Policy800 : ChainedPolicy<800, Policy800, Policy350> + struct Policy800 : ChainedPolicy<800, Policy800, Policy500> { // Use values from tuning if a specialization exists, otherwise pick the default template diff --git a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh index 3645e4b9ed7..08364fe381d 100644 --- a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh @@ -239,9 +239,9 @@ struct policy_hub DelayConstructor>; }; - struct Policy350 + struct Policy500 : DefaultPolicy> - , ChainedPolicy<350, Policy350, Policy350> + , ChainedPolicy<500, Policy500, Policy500> {}; // Use values from tuning if a specialization exists, otherwise pick DefaultPolicy @@ -258,7 +258,7 @@ struct policy_hub static auto select_agent_policy(long) -> typename DefaultPolicy< default_delay_constructor_t::pack_t>>::ThreeWayPartitionPolicy; - struct Policy800 : ChainedPolicy<800, Policy800, Policy350> + struct Policy800 : ChainedPolicy<800, Policy800, Policy500> { using ThreeWayPartitionPolicy = decltype(select_agent_policy>(0)); }; diff --git a/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh index f988d6fb29e..0c6b717de2c 100644 --- a/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh @@ -538,9 +538,9 @@ struct policy_hub detail::default_delay_constructor_t>; }; - struct Policy350 + struct Policy500 : DefaultPolicy<9, 128> - , ChainedPolicy<350, Policy350, Policy350> + , ChainedPolicy<500, Policy500, Policy500> {}; // Use values from tuning if a specialization exists, otherwise pick the default @@ -557,7 +557,7 @@ struct policy_hub struct Policy520 : DefaultPolicy<11, 64> - , ChainedPolicy<520, Policy520, Policy350> + , ChainedPolicy<520, Policy520, Policy500> {}; struct Policy800 : ChainedPolicy<800, Policy800, Policy520> diff --git a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu index db655b73404..12f0467d12b 100644 --- a/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu +++ b/cub/test/catch2_test_device_run_length_encode_non_trivial_runs.cu @@ -258,13 +258,13 @@ struct device_rle_policy_hub static constexpr int threads = 96; static constexpr int items = 15; - struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> + struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500> { using RleSweepPolicyT = cub:: AgentRlePolicy; }; - using MaxPolicy = Policy350; + using MaxPolicy = Policy500; }; struct CustomDeviceRunLengthEncode diff --git a/cub/test/catch2_test_util_device.cu b/cub/test/catch2_test_util_device.cu index 4c4d10a2ff1..da0eea09eba 100644 --- a/cub/test/catch2_test_util_device.cu +++ b/cub/test/catch2_test_util_device.cu @@ -125,9 +125,7 @@ C2H_TEST("PtxVersion returns a value from 
__CUDA_ARCH_LIST__/NV_TARGET_SM_INTEGE struct policy_hub_all { // for the list of supported architectures, see libcudacxx/include/nv/target - GEN_POLICY(350, 350); - GEN_POLICY(370, 350); - GEN_POLICY(500, 370); + GEN_POLICY(500, 500); GEN_POLICY(520, 500); GEN_POLICY(530, 520); GEN_POLICY(600, 530); @@ -258,8 +256,7 @@ DECLARE_TMPL_LAUNCH_WRAPPER(check_chained_policy_selects_correct_policy, struct policy_hub_some { - GEN_POLICY(350, 350); - GEN_POLICY(500, 350); + GEN_POLICY(500, 500); GEN_POLICY(700, 500); GEN_POLICY(900, 700); GEN_POLICY(2000, 900); // non-existing architecture, just to test @@ -268,30 +265,30 @@ struct policy_hub_some struct policy_hub_few { - GEN_POLICY(350, 350); - GEN_POLICY(860, 350); + GEN_POLICY(500, 500); + GEN_POLICY(860, 500); GEN_POLICY(2000, 860); // non-existing architecture, just to test using max_policy = policy2000; }; struct policy_hub_minimal { - GEN_POLICY(350, 350); - using max_policy = policy350; + GEN_POLICY(500, 500); + using max_policy = policy500; }; C2H_TEST("ChainedPolicy invokes correct policy", "[util][dispatch]") { SECTION("policy_hub_some") { - check_wrapper_some(::cuda::std::array{350, 500, 700, 900, 2000}); + check_wrapper_some(::cuda::std::array{500, 700, 900, 2000}); } SECTION("policy_hub_few") { - check_wrapper_some(::cuda::std::array{350, 860, 2000}); + check_wrapper_some(::cuda::std::array{500, 860, 2000}); } SECTION("policy_hub_minimal") { - check_wrapper_some(::cuda::std::array{350}); + check_wrapper_some(::cuda::std::array{500}); } } diff --git a/cub/test/catch2_test_vsmem.cu b/cub/test/catch2_test_vsmem.cu index 6b16bde7fa9..557f2c152d0 100644 --- a/cub/test/catch2_test_vsmem.cu +++ b/cub/test/catch2_test_vsmem.cu @@ -198,7 +198,7 @@ struct device_dummy_algorithm_policy_t static constexpr int FALLBACK_BLOCK_THREADS = 64; - struct policy_350 : cub::ChainedPolicy<350, policy_350, policy_350> + struct policy_500 : cub::ChainedPolicy<500, policy_500, policy_500> { using DummyAlgorithmPolicy = agent_dummy_algorithm_policy_t<256, cub::Nominal4BItemsToItems(17)>; @@ -208,7 +208,7 @@ struct device_dummy_algorithm_policy_t }; /// MaxPolicy - using max_policy_t = policy_350; + using max_policy_t = policy_500; }; //---------------------------------------------------------------------------- @@ -422,9 +422,9 @@ C2H_TEST("Virtual shared memory works within algorithms", "[util][vsmem]", type_ c2h::gen(C2H_SEED(1), in); // Query default and fallback policies and agents so we can confirm vsmem - using default_policy_t = typename device_dummy_algorithm_policy_t::policy_350::DummyAlgorithmPolicy; + using default_policy_t = typename device_dummy_algorithm_policy_t::policy_500::DummyAlgorithmPolicy; using default_agent_t = agent_dummy_algorithm_t; - using fallback_policy_t = typename device_dummy_algorithm_policy_t::policy_350::FallbackDummyAlgorithmPolicy; + using fallback_policy_t = typename device_dummy_algorithm_policy_t::policy_500::FallbackDummyAlgorithmPolicy; using fallback_agent_t = agent_dummy_algorithm_t; // Get the information as it is expected from the vsmem helper to work as expected diff --git a/docs/cub/developer_overview.rst b/docs/cub/developer_overview.rst index 4cc639e27fb..8b31dab6283 100644 --- a/docs/cub/developer_overview.rst +++ b/docs/cub/developer_overview.rst @@ -625,14 +625,14 @@ Finally, the tuning policy hub looks like: struct policy_hub { // TuningRelevantParams... could be used for decision making, like element types used, iterator category, etc. 
- // for SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy300> { + // for SM50 + struct Policy500 : ChainedPolicy<500, Policy500, Policy500> { using AlgorithmPolicy = AgentAlgorithmPolicy<256, 20, BLOCK_LOAD_DIRECT, LOAD_LDG>; // ... additional policies may exist, often one per agent }; // for SM60 - struct Policy600 : ChainedPolicy<600, Policy600, Policy350> { + struct Policy600 : ChainedPolicy<600, Policy600, Policy500> { using AlgorithmPolicy = AgentAlgorithmPolicy<256, 16, BLOCK_LOAD_DIRECT, LOAD_LDG>; }; diff --git a/docs/repo.toml b/docs/repo.toml index 999d62a8f20..7ff29fd6eba 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -115,7 +115,7 @@ doxygen_aliases = [ "smemstorage{1}=The operations exposed by \\1 require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the ``__shared__`` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or ``union``'d with other storage allocation types to facilitate memory reuse.", "granularity=Efficiency is increased with increased granularity ``ITEMS_PER_THREAD``. Performance is also typically increased until the additional register pressure or shared memory allocation size causes SM occupancy to fall too low. Consider variants of ``cub::BlockLoad`` for efficiently gathering a :ref:`blocked arrangement ` of elements across threads.", "blocksize=The number of threads in the block is a multiple of the architecture's warp size", - "ptxversion=The PTX compute capability for which to to specialize this collective, formatted as per the ``__CUDA_ARCH__`` macro (e.g., 350 for sm_35). Useful for determining the collective's storage requirements for a given device from the host. (Default: the value of ``__CUDA_ARCH__`` during the current compiler pass)", + "ptxversion=The PTX compute capability for which to to specialize this collective, formatted as per the ``__CUDA_ARCH__`` macro (e.g., 750 for sm_75). Useful for determining the collective's storage requirements for a given device from the host. 
(Default: the value of ``__CUDA_ARCH__`` during the current compiler pass)", "blockcollective{1}=Every thread in the block uses the \\1 class by first specializing the \\1 type, then instantiating an instance with parameters for communication, and finally invoking one or more collective member functions.", "warpcollective{1}=Every thread in the warp uses the \\1 class by first specializing the \\1 type, then instantiating an instance with parameters for communication, and finally invoking or more collective member functions.", "devicestorage=When ``d_temp_storage`` is ``nullptr``, no work is done and the required allocation size is returned in ``temp_storage_bytes``.", diff --git a/libcudacxx/test/libcudacxx/cuda/test_platform.pass.cpp b/libcudacxx/test/libcudacxx/cuda/test_platform.pass.cpp index 499c57a9a7e..25181f48a33 100644 --- a/libcudacxx/test/libcudacxx/cuda/test_platform.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/test_platform.pass.cpp @@ -65,10 +65,6 @@ __host__ __device__ void test() (static_assert(arch_val >= 520, "cuda arch expected 520");), NV_PROVIDES_SM_50, (static_assert(arch_val >= 500, "cuda arch expected 500");), - NV_PROVIDES_SM_37, - (static_assert(arch_val >= 370, "cuda arch expected 370");), - NV_PROVIDES_SM_35, - (static_assert(arch_val >= 350, "cuda arch expected 350");), NV_IS_HOST, (static_assert(arch_val == 0, "cuda arch expected 0");)) @@ -98,10 +94,6 @@ __host__ __device__ void test() (static_assert(arch_val == 520, "cuda arch expected 520");), NV_IS_EXACTLY_SM_50, (static_assert(arch_val == 500, "cuda arch expected 500");), - NV_IS_EXACTLY_SM_37, - (static_assert(arch_val == 370, "cuda arch expected 370");), - NV_IS_EXACTLY_SM_35, - (static_assert(arch_val == 350, "cuda arch expected 350");), NV_IS_HOST, (static_assert(arch_val == 0, "cuda arch expected 0");)) @@ -158,10 +150,6 @@ __host__ __device__ void test() (invoke_count += 1; invoke_count += threadIdx.x;), NV_PROVIDES_SM_50, (invoke_count += 1; invoke_count += threadIdx.x;), - NV_PROVIDES_SM_37, - (invoke_count += 1; invoke_count += threadIdx.x;), - NV_PROVIDES_SM_35, - (invoke_count += 1; invoke_count += threadIdx.x;), NV_IS_HOST, (invoke_count += 1;)) @@ -188,10 +176,6 @@ __host__ __device__ void test() (invoke_count += 1; invoke_count += threadIdx.x;), NV_IS_EXACTLY_SM_50, (invoke_count += 1; invoke_count += threadIdx.x;), - NV_IS_EXACTLY_SM_37, - (invoke_count += 1; invoke_count += threadIdx.x;), - NV_IS_EXACTLY_SM_35, - (invoke_count += 1; invoke_count += threadIdx.x;), NV_IS_HOST, (invoke_count += 1;)) @@ -252,10 +236,6 @@ void test() (static_assert(arch_val == 520, "cuda arch expected 520");), NV_PROVIDES_SM_50, (static_assert(arch_val == 500, "cuda arch expected 500");), - NV_PROVIDES_SM_37, - (static_assert(arch_val == 370, "cuda arch expected 370");), - NV_PROVIDES_SM_35, - (static_assert(arch_val == 350, "cuda arch expected 350");), NV_IS_HOST, (static_assert(arch_val == 0, "cuda arch expected 0");)) @@ -281,10 +261,6 @@ void test() (static_assert(arch_val == 520, "cuda arch expected 520");), NV_IS_EXACTLY_SM_50, (static_assert(arch_val == 500, "cuda arch expected 500");), - NV_IS_EXACTLY_SM_37, - (static_assert(arch_val == 370, "cuda arch expected 370");), - NV_IS_EXACTLY_SM_35, - (static_assert(arch_val == 350, "cuda arch expected 350");), NV_IS_HOST, (static_assert(arch_val == 0, "cuda arch expected 0");)) diff --git a/libcudacxx/test/support/concurrent_agents.h b/libcudacxx/test/support/concurrent_agents.h index 6b57b3531a0..6419613a5d8 100644 --- 
a/libcudacxx/test/support/concurrent_agents.h +++ b/libcudacxx/test/support/concurrent_agents.h @@ -13,10 +13,6 @@ #ifndef __CUDA_ARCH__ # include -#else -# if __CUDA_ARCH__ < 350 -# error "This test requires CUDA dynamic parallelism to work." -# endif #endif #include diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h index a3c8994d777..94a7e750aeb 100644 --- a/thrust/thrust/system/cuda/detail/core/util.h +++ b/thrust/thrust/system/cuda/detail/core/util.h @@ -64,32 +64,17 @@ namespace core # if (__NVCOMPILER_CUDA_ARCH__ >= 600) // deprecated [since 2.8] # define THRUST_TUNING_ARCH sm60 -# elif (__NVCOMPILER_CUDA_ARCH__ >= 520) -// deprecated [since 2.8] -# define THRUST_TUNING_ARCH sm52 -# elif (__NVCOMPILER_CUDA_ARCH__ >= 350) -// deprecated [since 2.8] -# define THRUST_TUNING_ARCH sm35 # else // deprecated [since 2.8] -# define THRUST_TUNING_ARCH sm30 +# define THRUST_TUNING_ARCH sm52 # endif #else # if (__CUDA_ARCH__ >= 600) // deprecated [since 2.8] # define THRUST_TUNING_ARCH sm60 -# elif (__CUDA_ARCH__ >= 520) +# else // deprecated [since 2.8] # define THRUST_TUNING_ARCH sm52 -# elif (__CUDA_ARCH__ >= 350) -// deprecated [since 2.8] -# define THRUST_TUNING_ARCH sm35 -# elif (__CUDA_ARCH__ >= 300) -// deprecated [since 2.8] -# define THRUST_TUNING_ARCH sm30 -# elif !defined(__CUDA_ARCH__) -// deprecated [since 2.8] -# define THRUST_TUNING_ARCH sm30 # endif #endif @@ -101,22 +86,7 @@ struct typelist; // supported SM arch // --------------------- -struct sm30 -{ - enum - { - ver = 300, - warpSize = 32 - }; -}; -struct sm35 -{ - enum - { - ver = 350, - warpSize = 32 - }; -}; + struct sm52 { enum @@ -137,7 +107,7 @@ struct sm60 // list of sm, checked from left to right order // the rightmost is the lowest sm arch supported // -------------------------------------------- -using sm_list = typelist; +using sm_list = typelist; // lowest supported SM arch // -------------------------------------------------------------------------- @@ -784,8 +754,6 @@ THRUST_RUNTIME_FUNCTION cudaError_t alias_storage( } } // namespace core -using core::sm30; -using core::sm35; using core::sm52; using core::sm60; } // namespace cuda_cub diff --git a/thrust/thrust/system/cuda/detail/reduce.h b/thrust/thrust/system/cuda/detail/reduce.h index 443063fb3b4..3787ab62367 100644 --- a/thrust/thrust/system/cuda/detail/reduce.h +++ b/thrust/thrust/system/cuda/detail/reduce.h @@ -109,7 +109,7 @@ template struct Tuning; template -struct Tuning +struct Tuning { enum { @@ -119,18 +119,6 @@ struct Tuning SCALE_FACTOR_1B = sizeof(T), }; - using type = - PtxPolicy<256, - (((20 / SCALE_FACTOR_4B) > (1)) ? 
(20 / SCALE_FACTOR_4B) : (1)), - 2, - cub::BLOCK_REDUCE_WARP_REDUCTIONS, - cub::LOAD_DEFAULT, - cub::GRID_MAPPING_RAKE>; -}; // Tuning sm30 - -template -struct Tuning : Tuning -{ // ReducePolicy1B (GTX Titan: 228.7 GB/s @ 192M 1B items) using ReducePolicy1B = PtxPolicy<128, @@ -150,7 +138,7 @@ struct Tuning : Tuning cub::GRID_MAPPING_DYNAMIC>; using type = ::cuda::std::conditional_t<(sizeof(T) < 4), ReducePolicy1B, ReducePolicy4B>; -}; // Tuning sm35 +}; // Tuning sm52 template struct ReduceAgent diff --git a/thrust/thrust/system/cuda/detail/reduce_by_key.h b/thrust/thrust/system/cuda/detail/reduce_by_key.h index cc59c98ab2c..ae1f0ffab96 100644 --- a/thrust/thrust/system/cuda/detail/reduce_by_key.h +++ b/thrust/thrust/system/cuda/detail/reduce_by_key.h @@ -115,54 +115,7 @@ template struct Tuning; template -struct Tuning -{ - enum - { - MAX_INPUT_BYTES = mpl::max::value, - COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value), - - NOMINAL_4B_ITEMS_PER_THREAD = 6, - - ITEMS_PER_THREAD = - mpl::min(((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) - / COMBINED_INPUT_BYTES)>::value>::value, - }; - - using type = - PtxPolicy<128, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_DEFAULT, cub::BLOCK_SCAN_WARP_SCANS>; -}; // Tuning sm30 - -template -struct Tuning : Tuning -{ - enum - { - MAX_INPUT_BYTES = mpl::max::value, - COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value), - - NOMINAL_4B_ITEMS_PER_THREAD = 6, - - ITEMS_PER_THREAD = - (MAX_INPUT_BYTES <= 8) - ? 6 - : mpl::min< - int, - NOMINAL_4B_ITEMS_PER_THREAD, - mpl::max:: - value>::value, - }; - - using type = - PtxPolicy<128, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_LDG, cub::BLOCK_SCAN_WARP_SCANS>; -}; // Tuning sm35 - -template -struct Tuning : Tuning +struct Tuning { enum { diff --git a/thrust/thrust/system/cuda/detail/set_operations.h b/thrust/thrust/system/cuda/detail/set_operations.h index 0ef80c0fb2d..7a267080bf8 100644 --- a/thrust/thrust/system/cuda/detail/set_operations.h +++ b/thrust/thrust/system/cuda/detail/set_operations.h @@ -221,27 +221,6 @@ struct Tuning; namespace mpl = thrust::detail::mpl::math; -template -struct Tuning -{ - enum - { - MAX_INPUT_BYTES = mpl::max::value, - COMBINED_INPUT_BYTES = sizeof(T), // + sizeof(Value), - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = - mpl::min(((NOMINAL_4B_ITEMS_PER_THREAD * 4) + COMBINED_INPUT_BYTES - 1) - / COMBINED_INPUT_BYTES)>::value>::value, - }; - - using type = - PtxPolicy<128, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_DEFAULT, cub::BLOCK_SCAN_WARP_SCANS>; -}; // tuning sm30 - template struct Tuning { diff --git a/thrust/thrust/system/cuda/detail/unique.h b/thrust/thrust/system/cuda/detail/unique.h index b8e408254cb..ac94017758b 100644 --- a/thrust/thrust/system/cuda/detail/unique.h +++ b/thrust/thrust/system/cuda/detail/unique.h @@ -137,36 +137,6 @@ struct Tuning PtxPolicy<64, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_LDG, cub::BLOCK_SCAN_WARP_SCANS>; }; // Tuning for sm52 -template -struct Tuning -{ - const static int INPUT_SIZE = sizeof(T); - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - // - ITEMS_PER_THREAD = items_per_thread::value - }; - - using type = - PtxPolicy<128, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_LDG, cub::BLOCK_SCAN_WARP_SCANS>; -}; // Tuning for sm35 - -template -struct Tuning -{ - const static int INPUT_SIZE = sizeof(T); - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - // - ITEMS_PER_THREAD = items_per_thread::value - }; - - using type = - 
PtxPolicy<128, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_DEFAULT, cub::BLOCK_SCAN_WARP_SCANS>; -}; // Tuning for sm30 - template struct UniqueAgent { From d21e0c9804ad63d23950c8b0a2462e5b7ebc8701 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 29 Jan 2025 22:52:12 +0100 Subject: [PATCH 2/4] PTX: Update generated files with Blackwell instructions (#3568) * ptx: Update existing instructions * ptx: Add new instructions * Fix returning error out values See: - https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/74 - https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/73 * ptx: Fix out var declaration See https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/75 * mbarrier.{test,try}_wait: Fix test. Wrong files were included. * docs: Fix special registers include * Allow non-included documentation pages * Workaround NVRTC Co-authored-by: Allard Hendriksen --- .../generated/barrier_cluster_aligned.rst | 63 + .../generated/clusterlaunchcontrol.rst | 68 + .../instructions/generated/cp_async_bulk.rst | 38 +- .../generated/cp_async_bulk_multicast.rst | 2 +- .../generated/cp_async_bulk_tensor.rst | 280 +- .../cp_async_bulk_tensor_gather_scatter.rst | 124 + .../cp_async_bulk_tensor_multicast.rst | 200 +- .../generated/cp_async_mbarrier_arrive.rst | 11 + .../cp_async_mbarrier_arrive_noinc.rst | 11 + .../generated/cp_reduce_async_bulk.rst | 6 +- .../ptx/instructions/generated/elect_sync.rst | 11 + .../ptx/instructions/generated/fence.rst | 170 +- .../generated/fence_proxy_async.rst | 6 +- ...ence_proxy_async_generic_sync_restrict.rst | 30 + .../generated/fence_sync_restrict.rst | 30 + .../{special_registers.rst => get_sreg.rst} | 99 +- .../ptx/instructions/generated/getctarank.rst | 2 +- .../ptx/instructions/generated/mapa.rst | 14 + .../generated/mbarrier_arrive.rst | 105 +- .../generated/mbarrier_arrive_expect_tx.rst | 54 +- .../generated/mbarrier_test_wait.rst | 34 +- .../generated/mbarrier_test_wait_parity.rst | 34 +- .../generated/mbarrier_try_wait.rst | 70 +- .../generated/mbarrier_try_wait_parity.rst | 70 +- .../generated/multimem_ld_reduce.rst | 2396 ++++++ .../instructions/generated/multimem_red.rst | 2306 ++++++ .../instructions/generated/multimem_st.rst | 250 + .../ptx/instructions/generated/red_async.rst | 32 +- .../ptx/instructions/generated/st_async.rst | 10 +- .../ptx/instructions/generated/st_bulk.rst | 13 + .../instructions/generated/tcgen05_alloc.rst | 70 + .../instructions/generated/tcgen05_commit.rst | 48 + .../ptx/instructions/generated/tcgen05_cp.rst | 434 ++ .../instructions/generated/tcgen05_fence.rst | 18 + .../ptx/instructions/generated/tcgen05_ld.rst | 758 ++ .../instructions/generated/tcgen05_mma.rst | 2378 ++++++ .../instructions/generated/tcgen05_mma_ws.rst | 4482 ++++++++++++ .../instructions/generated/tcgen05_shift.rst | 24 + .../ptx/instructions/generated/tcgen05_st.rst | 758 ++ .../instructions/generated/tcgen05_wait.rst | 18 + .../generated/tensormap_replace.rst | 114 +- .../ptx/instructions/special_registers.rst | 2 +- docs/repo.toml | 2 +- .../instructions/generated/barrier_cluster.h | 66 +- .../generated/barrier_cluster_aligned.h | 130 + .../generated/clusterlaunchcontrol.h | 240 + .../instructions/generated/cp_async_bulk.h | 153 +- .../generated/cp_async_bulk_commit_group.h | 12 +- .../generated/cp_async_bulk_multicast.h | 35 +- .../generated/cp_async_bulk_tensor.h | 849 ++- .../cp_async_bulk_tensor_gather_scatter.h | 288 + .../cp_async_bulk_tensor_multicast.h | 515 +- 
.../generated/cp_async_bulk_wait_group.h | 24 +- .../generated/cp_async_mbarrier_arrive.h | 26 + .../cp_async_mbarrier_arrive_noinc.h | 26 + .../generated/cp_reduce_async_bulk.h | 944 +-- .../generated/cp_reduce_async_bulk_bf16.h | 78 +- .../generated/cp_reduce_async_bulk_f16.h | 78 +- .../generated/cp_reduce_async_bulk_tensor.h | 788 +- .../__ptx/instructions/generated/elect_sync.h | 36 + .../cuda/__ptx/instructions/generated/fence.h | 224 +- .../generated/fence_mbarrier_init.h | 16 +- .../generated/fence_proxy_alias.h | 12 +- .../generated/fence_proxy_async.h | 44 +- .../fence_proxy_async_generic_sync_restrict.h | 62 + .../generated/fence_proxy_tensormap_generic.h | 96 +- .../generated/fence_sync_restrict.h | 62 + .../__ptx/instructions/generated/get_sreg.h | 506 +- .../__ptx/instructions/generated/getctarank.h | 21 +- .../cuda/__ptx/instructions/generated/mapa.h | 33 + .../instructions/generated/mbarrier_arrive.h | 338 +- .../generated/mbarrier_arrive_expect_tx.h | 153 +- .../generated/mbarrier_arrive_no_complete.h | 22 +- .../generated/mbarrier_expect_tx.h | 94 + .../instructions/generated/mbarrier_init.h | 12 +- .../generated/mbarrier_test_wait.h | 133 +- .../generated/mbarrier_test_wait_parity.h | 132 +- .../generated/mbarrier_try_wait.h | 278 +- .../generated/mbarrier_try_wait_parity.h | 278 +- .../generated/multimem_ld_reduce.h | 2148 ++++++ .../instructions/generated/multimem_red.h | 1272 ++++ .../instructions/generated/multimem_st.h | 186 + .../__ptx/instructions/generated/red_async.h | 335 +- .../__ptx/instructions/generated/st_async.h | 118 +- .../__ptx/instructions/generated/st_bulk.h | 31 + .../instructions/generated/tcgen05_alloc.h | 105 + .../instructions/generated/tcgen05_commit.h | 81 + .../__ptx/instructions/generated/tcgen05_cp.h | 612 ++ .../instructions/generated/tcgen05_fence.h | 44 + .../__ptx/instructions/generated/tcgen05_ld.h | 4446 ++++++++++++ .../instructions/generated/tcgen05_mma.h | 3842 ++++++++++ .../instructions/generated/tcgen05_mma_ws.h | 6438 +++++++++++++++++ .../instructions/generated/tcgen05_shift.h | 36 + .../__ptx/instructions/generated/tcgen05_st.h | 4554 ++++++++++++ .../instructions/generated/tcgen05_wait.h | 44 + .../generated/tensormap_cp_fenceproxy.h | 68 +- .../generated/tensormap_replace.h | 630 +- .../ptx/generated/barrier_cluster_aligned.h | 61 + .../cuda/ptx/generated/clusterlaunchcontrol.h | 84 + .../cuda/ptx/generated/cp_async_bulk.h | 29 +- .../ptx/generated/cp_async_bulk_multicast.h | 28 +- .../cuda/ptx/generated/cp_async_bulk_tensor.h | 325 +- .../cp_async_bulk_tensor_gather_scatter.h | 180 + .../cp_async_bulk_tensor_multicast.h | 405 +- .../ptx/generated/cp_async_mbarrier_arrive.h | 26 + .../cp_async_mbarrier_arrive_noinc.h | 26 + .../cuda/ptx/generated/elect_sync.h | 26 + .../libcudacxx/cuda/ptx/generated/fence.h | 64 +- .../fence_proxy_async_generic_sync_restrict.h | 38 + .../cuda/ptx/generated/fence_sync_restrict.h | 38 + .../test/libcudacxx/cuda/ptx/generated/mapa.h | 27 + .../cuda/ptx/generated/mbarrier_arrive.h | 56 + .../ptx/generated/mbarrier_arrive_expect_tx.h | 29 + .../cuda/ptx/generated/mbarrier_expect_tx.h | 50 + .../cuda/ptx/generated/mbarrier_test_wait.h | 55 + .../ptx/generated/mbarrier_test_wait_parity.h | 55 + .../cuda/ptx/generated/mbarrier_try_wait.h | 31 + .../ptx/generated/mbarrier_try_wait_parity.h | 32 + .../cuda/ptx/generated/mbarrier_wait.h | 24 - .../cuda/ptx/generated/mbarrier_wait_parity.h | 24 - .../cuda/ptx/generated/multimem_ld_reduce.h | 1020 +++ .../cuda/ptx/generated/multimem_red.h | 840 +++ 
.../cuda/ptx/generated/multimem_st.h | 110 + .../libcudacxx/cuda/ptx/generated/st_bulk.h | 26 + .../cuda/ptx/generated/tcgen05_alloc.h | 81 + .../cuda/ptx/generated/tcgen05_commit.h | 62 + .../cuda/ptx/generated/tcgen05_cp.h | 396 + .../cuda/ptx/generated/tcgen05_fence.h | 44 + .../cuda/ptx/generated/tcgen05_ld.h | 1012 +++ .../cuda/ptx/generated/tcgen05_mma.h | 2928 ++++++++ .../cuda/ptx/generated/tcgen05_mma_ws.h | 3570 +++++++++ .../cuda/ptx/generated/tcgen05_shift.h | 39 + .../cuda/ptx/generated/tcgen05_st.h | 1012 +++ .../cuda/ptx/generated/tcgen05_wait.h | 40 + .../cuda/ptx/generated/tensormap_replace.h | 390 +- .../libcudacxx/cuda/ptx/nvrtc_workaround.h | 34 + .../ptx/ptx.barrier.cluster.compile.pass.cpp | 2 + ...p.async.bulk.commit_group.compile.pass.cpp | 2 + .../ptx/ptx.cp.async.bulk.compile.pass.cpp | 2 + ...x.cp.async.bulk.multicast.compile.pass.cpp | 2 + .../ptx.cp.async.bulk.tensor.compile.pass.cpp | 2 + ...ync.bulk.tensor.multicast.compile.pass.cpp | 2 + ....cp.async.bulk.wait_group.compile.pass.cpp | 2 + .../ptx.cp.reduce.async.bulk.compile.pass.cpp | 2 + ....reduce.async.bulk.tensor.compile.pass.cpp | 2 + .../cuda/ptx/ptx.fence.compile.pass.cpp | 2 + .../cuda/ptx/ptx.get_sreg.compile.pass.cpp | 2 + .../cuda/ptx/ptx.getctarank.compile.pass.cpp | 2 + .../ptx/ptx.mbarrier.arrive.compile.pass.cpp | 2 + .../ptx/ptx.mbarrier.init.compile.pass.cpp | 2 + .../ptx/ptx.mbarrier.wait.compile.pass.cpp | 6 +- .../cuda/ptx/ptx.red.async.compile.pass.cpp | 2 + .../cuda/ptx/ptx.st.async.compile.pass.cpp | 2 + ...x.tensormap.cp_fenceproxy.compile.pass.cpp | 2 + .../ptx.tensormap.replace.compile.pass.cpp | 2 + 155 files changed, 58115 insertions(+), 2683 deletions(-) create mode 100644 docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/clusterlaunchcontrol.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/elect_sync.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence_sync_restrict.rst rename docs/libcudacxx/ptx/instructions/generated/{special_registers.rst => get_sreg.rst} (83%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/mapa.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/multimem_ld_reduce.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/multimem_red.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/multimem_st.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/st_bulk.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_alloc.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_commit.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_cp.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_fence.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_ld.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_mma.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_mma_ws.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_shift.rst create 
mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_st.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_wait.rst create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/multimem_red.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/multimem_st.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/st_bulk.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_commit.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_cp.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_fence.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_ld.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_shift.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_st.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_wait.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/clusterlaunchcontrol.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive_noinc.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async_generic_sync_restrict.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_sync_restrict.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_expect_tx.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait_parity.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h delete mode 100644 
libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_ld_reduce.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_red.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_st.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/st_bulk.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_alloc.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_commit.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_cp.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_fence.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_ld.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma_ws.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_shift.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_st.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_wait.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst new file mode 100644 index 00000000000..a24093ac7b6 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst @@ -0,0 +1,63 @@ +.. + This file was automatically generated. Do not edit. + +barrier.cluster.arrive.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90 + // .aligned = { .aligned } + // Marked volatile and as clobbering memory + template + __device__ static inline void barrier_cluster_arrive( + cuda::ptx::dot_aligned_t); + +barrier.cluster.wait.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // barrier.cluster.wait.aligned; // PTX ISA 78, SM_90 + // .aligned = { .aligned } + // Marked volatile and as clobbering memory + template + __device__ static inline void barrier_cluster_wait( + cuda::ptx::dot_aligned_t); + +barrier.cluster.arrive.release.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 + // .sem = { .release } + // .aligned = { .aligned } + // Marked volatile and as clobbering memory + template + __device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::dot_aligned_t); + +barrier.cluster.arrive.relaxed.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 + // .sem = { .relaxed } + // .aligned = { .aligned } + // Marked volatile + template + __device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_relaxed_t, + cuda::ptx::dot_aligned_t); + +barrier.cluster.wait.acquire.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90 + // .sem = { .acquire } + // .aligned = { .aligned } + // Marked volatile and as clobbering memory + template + __device__ static inline void barrier_cluster_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::dot_aligned_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/clusterlaunchcontrol.rst b/docs/libcudacxx/ptx/instructions/generated/clusterlaunchcontrol.rst new file mode 100644 index 00000000000..b372c5bf33e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/clusterlaunchcontrol.rst @@ -0,0 +1,68 @@ +.. + This file was automatically generated. Do not edit. + +clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [addr], [smem_bar]; // PTX ISA 86, SM_100 + template + __device__ static inline void clusterlaunchcontrol_try_cancel( + void* addr, + uint64_t* smem_bar); + +clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 [addr], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void clusterlaunchcontrol_try_cancel_multicast( + void* addr, + uint64_t* smem_bar); + +clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 pred_is_canceled, try_cancel_response; // PTX ISA 86, SM_100 + template = true> + __device__ static inline bool clusterlaunchcontrol_query_cancel_is_canceled( + B128 try_cancel_response); + +clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 + template = true, typename B128, enable_if_t = true> + __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_x( + B128 try_cancel_response); + +clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 + template = true, typename B128, enable_if_t = true> + __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_y( + B128 try_cancel_response); + +clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 + template = true, typename B128, enable_if_t = true> + __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_z( + B128 try_cancel_response); + +clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 block_dim, try_cancel_response; // PTX ISA 86, SM_100 + template = true, typename B128, enable_if_t = true> + __device__ static inline void clusterlaunchcontrol_query_cancel_get_first_ctaid( + B32 (&block_dim)[4], + B128 try_cancel_response); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst index 4883d8495eb..2bb334f1971 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst @@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90 + // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -17,11 +17,27 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes const uint32_t& size, uint64_t* smem_bar); +cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar); + cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 + // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } template @@ -37,7 +53,7 @@ cp.async.bulk.global.shared::cta.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 + // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -47,3 +63,19 @@ cp.async.bulk.global.shared::cta.bulk_group void* dstMem, const void* srcMem, const uint32_t& size); + +cp.async.bulk.global.shared::cta.bulk_group.cp_mask +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.dst.src.bulk_group.cp_mask [dstMem], [srcMem], size, byteMask; // PTX ISA 86, SM_100 + // .dst = { .global } + // .src = { .shared::cta } + template + __device__ static inline void cp_async_bulk_cp_mask( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + const uint16_t& byteMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst index af027c0b623..396a04e468b 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst @@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::clu ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a + // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst index 1c21efdd0a3..9d44a10800b 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst @@ -5,7 +5,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -17,11 +17,63 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[1], uint64_t* smem_bar); +cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); + +cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); + +cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); + cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -36,7 +88,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -48,11 +100,63 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[2], uint64_t* smem_bar); +cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); + cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -67,7 +171,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -79,11 +183,63 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[3], uint64_t* smem_bar); +cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); + +cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); + +cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); + cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -98,7 +254,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -110,11 +266,63 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[4], uint64_t* smem_bar); +cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); + +cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); + +cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); + cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -129,7 +337,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -141,11 +349,63 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[5], uint64_t* smem_bar); +cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst new file mode 100644 index 00000000000..971f0213cb0 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst @@ -0,0 +1,124 @@ +.. + This file was automatically generated. Do not edit. + +cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .global } + // .src = { .shared::cta } + template + __device__ static inline void cp_async_bulk_tensor_tile_scatter4( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + const void* srcMem); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst index ac33a05b69f..8ea38a2e0ad 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst @@ -5,7 +5,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -18,11 +18,49 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes uint64_t* smem_bar, const uint16_t& ctaMask); +cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); + cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -35,11 +73,49 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes uint64_t* smem_bar, const uint16_t& ctaMask); +cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); + cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. 
PTX ISA 80, SM_90a + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -52,11 +128,49 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes uint64_t* smem_bar, const uint16_t& ctaMask); +cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); + cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -69,11 +183,49 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes uint64_t* smem_bar, const uint16_t& ctaMask); +cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); + cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -85,3 +237,41 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[5], uint64_t* smem_bar, const uint16_t& ctaMask); + +cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive.rst new file mode 100644 index 00000000000..73ce222a9ec --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive.rst @@ -0,0 +1,11 @@ +.. + This file was automatically generated. Do not edit. + +cp.async.mbarrier.arrive.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.mbarrier.arrive.b64 [addr]; // PTX ISA 70, SM_80 + template + __device__ static inline void cp_async_mbarrier_arrive( + uint64_t* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.rst new file mode 100644 index 00000000000..31b7a2e5a2b --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.rst @@ -0,0 +1,11 @@ +.. + This file was automatically generated. Do not edit. + +cp.async.mbarrier.arrive.noinc.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.mbarrier.arrive.noinc.b64 [addr]; // PTX ISA 70, SM_80 + template + __device__ static inline void cp_async_mbarrier_arrive_noinc( + uint64_t* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst index b043eb9f456..8228b69ed41 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst @@ -10,7 +10,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.an // .src = { .shared::cta } // .type = { .b32 } // .op = { .and } - template + template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -29,7 +29,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or // .src = { .shared::cta } // .type = { .b32 } // .op = { .or } - template + template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -48,7 +48,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xo // .src = { .shared::cta } // .type = { .b32 } // .op = { .xor } - template + template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst b/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst new file mode 100644 index 00000000000..bc909c54319 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst @@ -0,0 +1,11 @@ +.. + This file was automatically generated. Do not edit. + +elect.sync +^^^^^^^^^^ +.. 
code:: cuda + + // elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90 + template + __device__ static inline bool elect_sync( + const uint32_t& membermask); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence.rst b/docs/libcudacxx/ptx/instructions/generated/fence.rst index ed21fa80b6e..50137394587 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence.rst @@ -5,94 +5,190 @@ fence.sc.cta ^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_t scope); fence.sc.gpu ^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_t scope); fence.sc.sys ^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_t scope); +fence.sc.cluster +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // 2. PTX ISA 78, SM_90 + // .sem = { .sc } + // .scope = { .cluster } + template + __device__ static inline void fence( + cuda::ptx::sem_sc_t, + cuda::ptx::scope_cluster_t); + fence.acq_rel.cta ^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .acq_rel } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_acq_rel_t, cuda::ptx::scope_t scope); fence.acq_rel.gpu ^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .acq_rel } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_acq_rel_t, cuda::ptx::scope_t scope); fence.acq_rel.sys ^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .acq_rel } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_acq_rel_t, cuda::ptx::scope_t scope); -fence.sc.cluster -^^^^^^^^^^^^^^^^ +fence.acq_rel.cluster +^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 2. PTX ISA 78, SM_90 + // .sem = { .acq_rel } // .scope = { .cluster } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_acq_rel_t, cuda::ptx::scope_cluster_t); -fence.acq_rel.cluster +fence.acquire.cta +^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); + +fence.acquire.cluster ^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 - // .sem = { .sc, .acq_rel } - // .scope = { .cluster } - template + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); + +fence.acquire.gpu +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); + +fence.acquire.sys +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); + +fence.release.cta +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.release.cluster +^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.release.gpu +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.release.sys +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst index 8376e96ce6b..9f4000b675e 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst @@ -13,7 +13,7 @@ fence.proxy.async.global ^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // fence.proxy.async.space; // 6. PTX ISA 80, SM_90 // .space = { .global, .shared::cluster, .shared::cta } template __device__ static inline void fence_proxy_async( @@ -23,7 +23,7 @@ fence.proxy.async.shared::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // fence.proxy.async.space; // 6. PTX ISA 80, SM_90 // .space = { .global, .shared::cluster, .shared::cta } template __device__ static inline void fence_proxy_async( @@ -33,7 +33,7 @@ fence.proxy.async.shared::cta ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // fence.proxy.async.space; // 6. 
PTX ISA 80, SM_90 // .space = { .global, .shared::cluster, .shared::cta } template __device__ static inline void fence_proxy_async( diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.rst new file mode 100644 index 00000000000..e67c4852355 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.rst @@ -0,0 +1,30 @@ +.. + This file was automatically generated. Do not edit. + +fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async::generic.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .space = { .shared::cluster } + // .scope = { .cluster } + template + __device__ static inline void fence_proxy_async_generic_sync_restrict( + cuda::ptx::sem_acquire_t, + cuda::ptx::space_cluster_t, + cuda::ptx::scope_cluster_t); + +fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async::generic.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .space = { .shared::cta } + // .scope = { .cluster } + template + __device__ static inline void fence_proxy_async_generic_sync_restrict( + cuda::ptx::sem_release_t, + cuda::ptx::space_shared_t, + cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_sync_restrict.rst b/docs/libcudacxx/ptx/instructions/generated/fence_sync_restrict.rst new file mode 100644 index 00000000000..bae82190e25 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_sync_restrict.rst @@ -0,0 +1,30 @@ +.. + This file was automatically generated. Do not edit. + +fence.acquire.sync_restrict::shared::cluster.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .space = { .shared::cluster } + // .scope = { .cluster } + template + __device__ static inline void fence_sync_restrict( + cuda::ptx::sem_acquire_t, + cuda::ptx::space_cluster_t, + cuda::ptx::scope_cluster_t); + +fence.release.sync_restrict::shared::cta.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .space = { .shared::cta } + // .scope = { .cluster } + template + __device__ static inline void fence_sync_restrict( + cuda::ptx::sem_release_t, + cuda::ptx::space_shared_t, + cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/special_registers.rst b/docs/libcudacxx/ptx/instructions/generated/get_sreg.rst similarity index 83% rename from docs/libcudacxx/ptx/instructions/generated/special_registers.rst rename to docs/libcudacxx/ptx/instructions/generated/get_sreg.rst index aa1add84781..9582c4384ff 100644 --- a/docs/libcudacxx/ptx/instructions/generated/special_registers.rst +++ b/docs/libcudacxx/ptx/instructions/generated/get_sreg.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + tid.x ^^^^^ .. code:: cuda // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_tid_x(); tid.y @@ -11,7 +14,7 @@ tid.y .. 
code:: cuda // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_tid_y(); tid.z @@ -19,7 +22,7 @@ tid.z .. code:: cuda // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_tid_z(); ntid.x @@ -27,7 +30,7 @@ ntid.x .. code:: cuda // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ntid_x(); ntid.y @@ -35,7 +38,7 @@ ntid.y .. code:: cuda // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ntid_y(); ntid.z @@ -43,7 +46,7 @@ ntid.z .. code:: cuda // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ntid_z(); laneid @@ -51,7 +54,7 @@ laneid .. code:: cuda // mov.u32 sreg_value, %%laneid; // PTX ISA 13 - template + template __device__ static inline uint32_t get_sreg_laneid(); warpid @@ -59,7 +62,7 @@ warpid .. code:: cuda // mov.u32 sreg_value, %%warpid; // PTX ISA 13 - template + template __device__ static inline uint32_t get_sreg_warpid(); nwarpid @@ -67,7 +70,7 @@ nwarpid .. code:: cuda // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_nwarpid(); ctaid.x @@ -75,7 +78,7 @@ ctaid.x .. code:: cuda // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ctaid_x(); ctaid.y @@ -83,7 +86,7 @@ ctaid.y .. code:: cuda // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ctaid_y(); ctaid.z @@ -91,7 +94,7 @@ ctaid.z .. code:: cuda // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ctaid_z(); nctaid.x @@ -99,7 +102,7 @@ nctaid.x .. code:: cuda // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_nctaid_x(); nctaid.y @@ -107,7 +110,7 @@ nctaid.y .. code:: cuda // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_nctaid_y(); nctaid.z @@ -115,7 +118,7 @@ nctaid.z .. code:: cuda // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_nctaid_z(); smid @@ -123,7 +126,7 @@ smid .. code:: cuda // mov.u32 sreg_value, %%smid; // PTX ISA 13 - template + template __device__ static inline uint32_t get_sreg_smid(); nsmid @@ -131,7 +134,7 @@ nsmid .. code:: cuda // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_nsmid(); gridid @@ -139,7 +142,7 @@ gridid .. code:: cuda // mov.u64 sreg_value, %%gridid; // PTX ISA 30 - template + template __device__ static inline uint64_t get_sreg_gridid(); is_explicit_cluster @@ -147,7 +150,7 @@ is_explicit_cluster .. code:: cuda // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 - template + template __device__ static inline bool get_sreg_is_explicit_cluster(); clusterid.x @@ -155,7 +158,7 @@ clusterid.x .. code:: cuda // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_clusterid_x(); clusterid.y @@ -163,7 +166,7 @@ clusterid.y .. code:: cuda // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_clusterid_y(); clusterid.z @@ -171,7 +174,7 @@ clusterid.z .. 
code:: cuda // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_clusterid_z(); nclusterid.x @@ -179,7 +182,7 @@ nclusterid.x .. code:: cuda // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_nclusterid_x(); nclusterid.y @@ -187,7 +190,7 @@ nclusterid.y .. code:: cuda // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_nclusterid_y(); nclusterid.z @@ -195,7 +198,7 @@ nclusterid.z .. code:: cuda // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_nclusterid_z(); cluster_ctaid.x @@ -203,7 +206,7 @@ cluster_ctaid.x .. code:: cuda // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); cluster_ctaid.y @@ -211,7 +214,7 @@ cluster_ctaid.y .. code:: cuda // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); cluster_ctaid.z @@ -219,7 +222,7 @@ cluster_ctaid.z .. code:: cuda // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); cluster_nctaid.x @@ -227,7 +230,7 @@ cluster_nctaid.x .. code:: cuda // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); cluster_nctaid.y @@ -235,7 +238,7 @@ cluster_nctaid.y .. code:: cuda // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); cluster_nctaid.z @@ -243,7 +246,7 @@ cluster_nctaid.z .. code:: cuda // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); cluster_ctarank @@ -251,7 +254,7 @@ cluster_ctarank .. code:: cuda // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_ctarank(); cluster_nctarank @@ -259,7 +262,7 @@ cluster_nctarank .. code:: cuda // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_nctarank(); lanemask_eq @@ -267,7 +270,7 @@ lanemask_eq .. code:: cuda // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_eq(); lanemask_le @@ -275,7 +278,7 @@ lanemask_le .. code:: cuda // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_le(); lanemask_lt @@ -283,7 +286,7 @@ lanemask_lt .. code:: cuda // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_lt(); lanemask_ge @@ -291,7 +294,7 @@ lanemask_ge .. code:: cuda // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_ge(); lanemask_gt @@ -299,7 +302,7 @@ lanemask_gt .. code:: cuda // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_gt(); clock @@ -307,7 +310,7 @@ clock .. 
code:: cuda // mov.u32 sreg_value, %%clock; // PTX ISA 10 - template + template __device__ static inline uint32_t get_sreg_clock(); clock_hi @@ -315,7 +318,7 @@ clock_hi .. code:: cuda // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 - template + template __device__ static inline uint32_t get_sreg_clock_hi(); clock64 @@ -323,7 +326,7 @@ clock64 .. code:: cuda // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 - template + template __device__ static inline uint64_t get_sreg_clock64(); globaltimer @@ -331,7 +334,7 @@ globaltimer .. code:: cuda // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 - template + template __device__ static inline uint64_t get_sreg_globaltimer(); globaltimer_lo @@ -339,7 +342,7 @@ globaltimer_lo .. code:: cuda // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 - template + template __device__ static inline uint32_t get_sreg_globaltimer_lo(); globaltimer_hi @@ -347,7 +350,7 @@ globaltimer_hi .. code:: cuda // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 - template + template __device__ static inline uint32_t get_sreg_globaltimer_hi(); total_smem_size @@ -355,7 +358,7 @@ total_smem_size .. code:: cuda // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 - template + template __device__ static inline uint32_t get_sreg_total_smem_size(); aggr_smem_size @@ -363,7 +366,7 @@ aggr_smem_size .. code:: cuda // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 - template + template __device__ static inline uint32_t get_sreg_aggr_smem_size(); dynamic_smem_size @@ -371,7 +374,7 @@ dynamic_smem_size .. code:: cuda // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 - template + template __device__ static inline uint32_t get_sreg_dynamic_smem_size(); current_graph_exec @@ -379,5 +382,5 @@ current_graph_exec .. code:: cuda // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 - template + template __device__ static inline uint64_t get_sreg_current_graph_exec(); diff --git a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst index 374c182576f..19b3783086c 100644 --- a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst +++ b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst @@ -5,7 +5,7 @@ getctarank.shared::cluster.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 + // getctarank.space.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } template __device__ static inline uint32_t getctarank( diff --git a/docs/libcudacxx/ptx/instructions/generated/mapa.rst b/docs/libcudacxx/ptx/instructions/generated/mapa.rst new file mode 100644 index 00000000000..4ffc70d85d9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mapa.rst @@ -0,0 +1,14 @@ +.. + This file was automatically generated. Do not edit. + +mapa.shared::cluster.u32 +^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
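
The special-register getters above are thin wrappers around ``mov`` from the corresponding ``%%sreg``; a usage sketch (assuming ``<cuda/ptx>`` and an sm_90 target for the cluster registers):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   __device__ void read_special_registers()
   {
     uint32_t tid  = cuda::ptx::get_sreg_tid_x();           // thread index in the CTA
     uint32_t lane = cuda::ptx::get_sreg_laneid();          // lane within the warp
     uint32_t rank = cuda::ptx::get_sreg_cluster_ctarank(); // CTA rank in the cluster (SM_90)
     uint64_t clk  = cuda::ptx::get_sreg_clock64();         // 64-bit cycle counter
     (void) tid; (void) lane; (void) rank; (void) clk;
   }
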
code:: cuda + + // mapa.space.u32 dest, addr, target_cta; // PTX ISA 78, SM_90 + // .space = { .shared::cluster } + template + __device__ static inline Tp* mapa( + cuda::ptx::space_cluster_t, + const Tp* addr, + uint32_t target_cta); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst index 21436e2b3ca..fea199e4747 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst @@ -24,7 +24,7 @@ mbarrier.arrive.release.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -39,7 +39,7 @@ mbarrier.arrive.release.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -54,7 +54,7 @@ mbarrier.arrive.release.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -70,7 +70,7 @@ mbarrier.arrive.release.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -86,7 +86,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -101,7 +101,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -112,3 +112,96 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& count); + +mbarrier.arrive.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
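
A sketch of ``mapa`` mapping a shared-memory address into a peer CTA of the cluster (assuming ``<cuda/ptx>``, an sm_90 target, and the ``cuda::ptx::space_cluster`` tag value; ``peer_rank`` is a hypothetical CTA rank obtained elsewhere):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   __device__ int load_from_peer_cta(const int* smem_counter, uint32_t peer_rank)
   {
     // Map this CTA's shared-memory address to the same offset in the peer
     // CTA, then read it through the cluster shared-memory window.
     const int* peer = cuda::ptx::mapa(cuda::ptx::space_cluster, smem_counter, peer_rank);
     return *peer;
   }
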
code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr); + +mbarrier.arrive.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr); + +mbarrier.arrive.relaxed.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // PTX ISA 86, SM_90 + // .space = { .shared::cluster } + // .sem = { .relaxed } + // .scope = { .cluster } + template + __device__ static inline void mbarrier_arrive( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.relaxed.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 _, [addr]; // PTX ISA 86, SM_90 + // .space = { .shared::cluster } + // .sem = { .relaxed } + // .scope = { .cluster } + template + __device__ static inline void mbarrier_arrive( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst index 47c56eca31a..318a7eb5b98 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst @@ -5,7 +5,7 @@ mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 + // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], tx_count; // 8. 
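
A sketch of the new relaxed ``mbarrier_arrive`` overloads above (assuming ``<cuda/ptx>``, an sm_90 target, the ``cuda::ptx`` tag values, and that ``bar`` points to an already-initialized mbarrier in shared memory):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   __device__ uint64_t arrive_relaxed(uint64_t* bar)
   {
     // Relaxed arrival (count = 1) on a shared-memory mbarrier, CTA scope;
     // returns the phase token for a later test_wait/try_wait.
     return cuda::ptx::mbarrier_arrive(
       cuda::ptx::space_shared, cuda::ptx::sem_relaxed, cuda::ptx::scope_cta, bar, 1);
   }
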
PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -21,7 +21,7 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 + // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -37,7 +37,7 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 + // mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -48,3 +48,51 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& tx_count); + +mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& txCount); + +mbarrier.arrive.expect_tx.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& txCount); + +mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], txCount; // PTX ISA 86, SM_90 + // .space = { .shared::cluster } + // .sem = { .relaxed } + // .scope = { .cluster } + template + __device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr, + const uint32_t& txCount); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst index d16b2ac07ac..88ec36b43ac 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst @@ -15,7 +15,7 @@ mbarrier.test_wait.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 + // mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 2. 
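
A sketch of the relaxed ``mbarrier_arrive_expect_tx`` overloads above (same assumptions as the previous sketch; ``bytes`` is the transaction count an async copy will later complete on this mbarrier):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   __device__ uint64_t expect_bytes_relaxed(uint64_t* bar, uint32_t bytes)
   {
     // One arrival that also registers `bytes` of expected transactions
     // (relaxed semantics, CTA scope).
     return cuda::ptx::mbarrier_arrive_expect_tx(
       cuda::ptx::space_shared, cuda::ptx::sem_relaxed, cuda::ptx::scope_cta, bar, bytes);
   }
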
PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -29,7 +29,7 @@ mbarrier.test_wait.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 + // mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -38,3 +38,33 @@ mbarrier.test_wait.acquire.cluster.shared::cta.b64 cuda::ptx::scope_t scope, uint64_t* addr, const uint64_t& state); + +mbarrier.test_wait.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_test_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); + +mbarrier.test_wait.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_test_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst index ec464b3398b..1496d6cbccb 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst @@ -15,7 +15,7 @@ mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 + // mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -29,7 +29,7 @@ mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 + // mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -38,3 +38,33 @@ mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 cuda::ptx::scope_t scope, uint64_t* addr, const uint32_t& phaseParity); + +mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
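
A polling sketch for ``mbarrier_test_wait`` (shown with the acquire overload documented above; assuming ``<cuda/ptx>``, an sm_90 target, and that ``state`` is the token returned by a matching arrive):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   __device__ void spin_until_complete(uint64_t* bar, uint64_t state)
   {
     // test_wait never blocks; poll until the phase identified by `state`
     // has completed.
     while (!cuda::ptx::mbarrier_test_wait(
              cuda::ptx::sem_acquire, cuda::ptx::scope_cta, bar, state))
     {
     }
   }
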
code:: cuda + + // mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_test_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_test_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst index 3dfdba46861..4d319a5b1e3 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst @@ -26,7 +26,7 @@ mbarrier.try_wait.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -40,7 +40,7 @@ mbarrier.try_wait.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -54,7 +54,7 @@ mbarrier.try_wait.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -69,7 +69,7 @@ mbarrier.try_wait.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -79,3 +79,65 @@ mbarrier.try_wait.acquire.cluster.shared::cta.b64 uint64_t* addr, const uint64_t& state, const uint32_t& suspendTimeHint); + +mbarrier.try_wait.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst index 4e7af4bace5..6a51704cab4 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst @@ -26,7 +26,7 @@ mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -40,7 +40,7 @@ mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -54,7 +54,7 @@ mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
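
A sketch for ``mbarrier_try_wait`` with a suspend-time hint (acquire overload documented above; assuming ``<cuda/ptx>`` and an sm_90 target; the hint is advisory and, per the PTX spec, given in nanoseconds):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   __device__ void wait_with_hint(uint64_t* bar, uint64_t state)
   {
     // try_wait may suspend the calling thread up to the hint before
     // reporting whether the phase completed; loop until it has.
     while (!cuda::ptx::mbarrier_try_wait(
              cuda::ptx::sem_acquire, cuda::ptx::scope_cta, bar, state, 100000u))
     {
     }
   }
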
PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -69,7 +69,7 @@ mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -79,3 +79,65 @@ mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 uint64_t* addr, const uint32_t& phaseParity, const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/multimem_ld_reduce.rst b/docs/libcudacxx/ptx/instructions/generated/multimem_ld_reduce.rst new file mode 100644 index 00000000000..cd9f32bf5f0 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/multimem_ld_reduce.rst @@ -0,0 +1,2396 @@ +.. + This file was automatically generated. Do not edit. + +multimem.ld_reduce.weak.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cta.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
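
A sketch for the ``multimem_ld_reduce`` overloads above (assuming ``<cuda/ptx>``, an sm_90 target, and that ``addr`` is a valid multimem address; the weak overload shown here omits the scope argument used by the relaxed/acquire forms):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   __device__ uint32_t multimem_min_u32(const uint32_t* addr)
   {
     // Load from every memory the multimem address maps to and reduce the
     // values with min, using weak memory semantics.
     return cuda::ptx::multimem_ld_reduce(
       cuda::ptx::sem_weak, cuda::ptx::op_min, addr);
   }
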
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.sys.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.weak.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cta.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.sys.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.weak.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cta.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.sys.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.weak.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cta.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.sys.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.weak.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cta.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.sys.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.weak.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cta.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.sys.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.weak.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cta.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.sys.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.weak.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cta.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.sys.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.weak.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cta.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.sys.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.weak.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.weak.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cta.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.sys.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.weak.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
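A similar hedged sketch for the add overloads above (note that the 64-bit signed overloads map to the .u64 form of the instruction, as the headings indicate). The same assumptions apply: a host-prepared multimem address, an sm_90+ build, and illustrative names.

.. code:: cuda

   // Illustrative sketch, not part of the generated documentation.
   #include <cuda/ptx>
   #include <cstdint>

   __global__ void sum_across_replicas(const int64_t* mc_counter, int64_t* out)
   {
     // Reads the counter from every replica of `mc_counter` and returns the sum
     // (multimem.ld_reduce.relaxed.gpu.global.add.u64 applied to signed 64-bit data).
     *out = cuda::ptx::multimem_ld_reduce(
       cuda::ptx::sem_relaxed, cuda::ptx::scope_gpu, cuda::ptx::op_add, mc_counter);
   }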
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.weak.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .and } + template = true> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cta.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cluster.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.gpu.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.sys.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cta.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cluster.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.gpu.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.sys.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.weak.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .or } + template = true> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cta.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cluster.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.gpu.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.sys.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cta.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cluster.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.gpu.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.sys.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.weak.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .xor } + template = true> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cta.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cluster.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.gpu.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.sys.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cta.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cluster.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.gpu.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.sys.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.weak.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .and } + template = true> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cta.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cluster.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.gpu.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.sys.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cta.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cluster.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.gpu.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.sys.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.weak.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .or } + template = true> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cta.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cluster.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.gpu.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.sys.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cta.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cluster.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.gpu.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.sys.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.weak.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .xor } + template = true> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cta.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cluster.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.gpu.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.sys.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cta.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cluster.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.gpu.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.sys.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/multimem_red.rst b/docs/libcudacxx/ptx/instructions/generated/multimem_red.rst new file mode 100644 index 00000000000..095efaef45c --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/multimem_red.rst @@ -0,0 +1,2306 @@ +.. + This file was automatically generated. Do not edit. + +multimem.red.relaxed.cta.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cluster.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.gpu.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.sys.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cta.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cluster.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.gpu.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.sys.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cta.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cluster.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.gpu.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.sys.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cta.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cluster.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.gpu.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.sys.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cta.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cluster.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.gpu.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.sys.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cta.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cluster.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.release.gpu.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.release.sys.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cta.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cluster.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.gpu.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.sys.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cta.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cluster.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.release.gpu.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.release.sys.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cta.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cluster.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.gpu.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.sys.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cta.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cluster.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.gpu.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.sys.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cta.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cluster.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.gpu.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.sys.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cta.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cluster.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.gpu.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.sys.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cta.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cluster.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.gpu.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.sys.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cta.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cluster.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.release.gpu.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.release.sys.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cta.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cluster.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.gpu.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.sys.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cta.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cluster.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.release.gpu.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.release.sys.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cta.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cluster.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.gpu.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.sys.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cta.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cluster.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.gpu.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.sys.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cta.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cluster.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.gpu.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.sys.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cta.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cluster.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.release.gpu.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.release.sys.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.release.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.release.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cta.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cluster.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.gpu.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.sys.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.release.cta.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.release.cluster.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.release.gpu.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.release.sys.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cta.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cluster.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.gpu.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.sys.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.release.cta.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.release.cluster.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.release.gpu.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.release.sys.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cta.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cluster.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.gpu.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.sys.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.release.cta.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.release.cluster.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.release.gpu.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.release.sys.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cta.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cluster.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.gpu.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.sys.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.release.cta.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.release.cluster.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.release.gpu.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.release.sys.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cta.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cluster.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.gpu.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.sys.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.release.cta.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.release.cluster.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.release.gpu.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.release.sys.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cta.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cluster.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.gpu.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.sys.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.release.cta.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.release.cluster.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.release.gpu.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.release.sys.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); diff --git a/docs/libcudacxx/ptx/instructions/generated/multimem_st.rst b/docs/libcudacxx/ptx/instructions/generated/multimem_st.rst new file mode 100644 index 00000000000..00695328b76 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/multimem_st.rst @@ -0,0 +1,250 @@ +.. + This file was automatically generated. Do not edit. + +multimem.st.weak.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .weak } + template = true> + __device__ static inline void multimem_st( + cuda::ptx::sem_weak_t, + B32* addr, + B32 val); + +multimem.st.relaxed.cta.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.relaxed.cluster.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.relaxed.gpu.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.relaxed.sys.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.release.cta.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.release.cluster.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.release.gpu.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.release.sys.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.weak.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .weak } + template = true> + __device__ static inline void multimem_st( + cuda::ptx::sem_weak_t, + B64* addr, + B64 val); + +multimem.st.relaxed.cta.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.relaxed.cluster.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.relaxed.gpu.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.relaxed.sys.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.release.cta.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.release.cluster.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.release.gpu.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.release.sys.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); diff --git a/docs/libcudacxx/ptx/instructions/generated/red_async.rst b/docs/libcudacxx/ptx/instructions/generated/red_async.rst index 658fe0a8f44..c575b808401 100644 --- a/docs/libcudacxx/ptx/instructions/generated/red_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/red_async.rst @@ -5,7 +5,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } template @@ -19,7 +19,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } template @@ -33,7 +33,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
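The multimem.red and multimem.st wrappers documented above operate on multimem (multicast) addresses. Below is a minimal usage sketch, not part of the generated files: it assumes `mc_flag` and `mc_mask` are multicast addresses created beforehand with the CUDA multicast / virtual-memory APIs (setup not shown), that the code is compiled for and run on SM_90 or newer, and it uses the standard `cuda::ptx` tag constants `sem_relaxed`, `scope_gpu`, and `op_or_op`.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Broadcast a ready flag and OR a bitmask into every replica reachable
   // through a multicast mapping. `mc_flag` and `mc_mask` are assumed to be
   // multimem (multicast) addresses; how they are created is not shown here.
   __global__ void multimem_example(std::uint32_t* mc_flag, std::uint64_t* mc_mask, std::uint64_t bits)
   {
   #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
     if (threadIdx.x == 0)
     {
       // multimem.st.relaxed.gpu.global.b32 [mc_flag], 1;
       cuda::ptx::multimem_st(cuda::ptx::sem_relaxed, cuda::ptx::scope_gpu, mc_flag, 1u);
       // multimem.red.relaxed.gpu.global.or.b64 [mc_mask], bits;
       cuda::ptx::multimem_red(cuda::ptx::sem_relaxed, cuda::ptx::scope_gpu, cuda::ptx::op_or_op, mc_mask, bits);
     }
   #endif
   }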
code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } template @@ -47,7 +47,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } template @@ -61,7 +61,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } template @@ -75,7 +75,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } template @@ -89,7 +89,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } template @@ -103,7 +103,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } template @@ -117,10 +117,10 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .b32 } // .op = { .and } - template + template = true> __device__ static inline void red_async( cuda::ptx::op_and_op_t, B32* dest, @@ -131,10 +131,10 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .b32 } // .op = { .or } - template + template = true> __device__ static inline void red_async( cuda::ptx::op_or_op_t, B32* dest, @@ -145,10 +145,10 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .b32 } // .op = { .xor } - template + template = true> __device__ static inline void red_async( cuda::ptx::op_xor_op_t, B32* dest, @@ -159,7 +159,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } template @@ -173,7 +173,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } template __device__ static inline void red_async( diff --git a/docs/libcudacxx/ptx/instructions/generated/st_async.rst b/docs/libcudacxx/ptx/instructions/generated/st_async.rst index d00a152cf29..8cfc21ba0b5 100644 --- a/docs/libcudacxx/ptx/instructions/generated/st_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/st_async.rst @@ -5,7 +5,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.type [addr], value, [remote_bar]; // 1. 
PTX ISA 81, SM_90 // .type = { .b32, .b64 } template __device__ static inline void st_async( @@ -17,7 +17,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.type [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 // .type = { .b32, .b64 } template __device__ static inline void st_async( @@ -29,7 +29,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.type [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 // .type = { .b32, .b64 } template __device__ static inline void st_async( @@ -41,7 +41,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.type [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 // .type = { .b32, .b64 } template __device__ static inline void st_async( @@ -54,7 +54,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 .. code:: cuda // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90 - template + template = true> __device__ static inline void st_async( B32* addr, const B32 (&value)[4], diff --git a/docs/libcudacxx/ptx/instructions/generated/st_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/st_bulk.rst new file mode 100644 index 00000000000..817d3875fdc --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/st_bulk.rst @@ -0,0 +1,13 @@ +.. + This file was automatically generated. Do not edit. + +st.bulk.weak.shared::cta +^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // st.bulk.weak.shared::cta [addr], size, initval; // PTX ISA 86, SM_100 + template + __device__ static inline void st_bulk( + void* addr, + uint64_t size, + cuda::ptx::n32_t initval); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_alloc.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_alloc.rst new file mode 100644 index 00000000000..3bfb60fca71 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_alloc.rst @@ -0,0 +1,70 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.alloc.cta_group.sync.aligned.shared::cta.b32 [dst], nCols; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_alloc( + cuda::ptx::cta_group_t cta_group, + uint32_t* dst, + const uint32_t& nCols); + +tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
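The st.async and red.async wrappers above push a value into another CTA's shared memory within a thread-block cluster and make the write observable through an mbarrier in the destination CTA. The following sketch (not part of the generated files) assumes the kernel is launched as a cluster with an even number of CTAs (launch code not shown) and that initializing, and waiting on, the peer's mbarrier happens elsewhere.

.. code:: cuda

   #include <cuda/ptx>
   #include <cooperative_groups.h>
   #include <cstdint>

   namespace cg = cooperative_groups;

   // Store a value into a peer CTA's shared memory and bump a counter there,
   // signalling the peer's mbarrier on completion. The mbarrier must have been
   // initialized by the peer (omitted), and the kernel must be launched as a
   // thread-block cluster (also omitted).
   __global__ void cluster_push_example()
   {
   #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
     __shared__ std::uint32_t slot;
     __shared__ std::uint32_t count;
     __shared__ std::uint64_t bar;

     cg::cluster_group cluster = cg::this_cluster();
     unsigned int peer         = cluster.block_rank() ^ 1u; // partner CTA in a pair

     // Map the peer CTA's shared-memory objects into this CTA's address space.
     std::uint32_t* remote_slot  = cluster.map_shared_rank(&slot, peer);
     std::uint32_t* remote_count = cluster.map_shared_rank(&count, peer);
     std::uint64_t* remote_bar   = cluster.map_shared_rank(&bar, peer);

     if (threadIdx.x == 0)
     {
       // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32
       cuda::ptx::st_async(remote_slot, blockIdx.x, remote_bar);
       // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32
       cuda::ptx::red_async(cuda::ptx::op_add, remote_count, 1u, remote_bar);
     }
   #endif
   }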
code:: cuda + + // tcgen05.alloc.cta_group.sync.aligned.shared::cta.b32 [dst], nCols; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_alloc( + cuda::ptx::cta_group_t cta_group, + uint32_t* dst, + const uint32_t& nCols); + +tcgen05.dealloc.cta_group::1.sync.aligned.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.dealloc.cta_group.sync.aligned.b32 taddr, nCols; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_dealloc( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + const uint32_t& nCols); + +tcgen05.dealloc.cta_group::2.sync.aligned.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.dealloc.cta_group.sync.aligned.b32 taddr, nCols; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_dealloc( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + const uint32_t& nCols); + +tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.relinquish_alloc_permit.cta_group.sync.aligned; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_relinquish_alloc_permit( + cuda::ptx::cta_group_t cta_group); + +tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.relinquish_alloc_permit.cta_group.sync.aligned; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_relinquish_alloc_permit( + cuda::ptx::cta_group_t cta_group); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_commit.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_commit.rst new file mode 100644 index 00000000000..d5546fed3e5 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_commit.rst @@ -0,0 +1,48 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_commit( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar); + +tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_commit( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar); + +tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
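The tcgen05.alloc / tcgen05.dealloc / tcgen05.relinquish_alloc_permit wrappers above manage tensor-memory columns for a CTA (or CTA pair). A minimal sketch, not part of the generated files: it assumes compilation for sm_100a or sm_101a, a one-dimensional block of at least one full warp, and that `cuda::ptx::cta_group_1` is the tag constant selecting the `.cta_group::1` qualifier (the constant's name is assumed here).

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Allocate 64 columns of tensor memory, use them, then release them.
   // Assumes sm_100a/sm_101a and that cuda::ptx::cta_group_1 names the
   // .cta_group::1 tag constant.
   __global__ void tcgen05_alloc_example()
   {
   #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
     __shared__ std::uint32_t taddr_slot; // alloc writes the tensor-memory address here
     constexpr std::uint32_t ncols = 64;  // number of columns to allocate

     if (threadIdx.x < 32) // one warp issues the warp-collective alloc
     {
       // tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [dst], nCols;
       cuda::ptx::tcgen05_alloc(cuda::ptx::cta_group_1, &taddr_slot, ncols);
     }
     __syncthreads();

     std::uint32_t taddr = taddr_slot;
     // ... tcgen05.cp / tcgen05.ld / MMA operations against taddr go here ...
     __syncthreads();

     if (threadIdx.x < 32)
     {
       // tcgen05.dealloc.cta_group::1.sync.aligned.b32 taddr, nCols;
       cuda::ptx::tcgen05_dealloc(cuda::ptx::cta_group_1, taddr, ncols);
       // This CTA will not allocate tensor memory again.
       cuda::ptx::tcgen05_relinquish_alloc_permit(cuda::ptx::cta_group_1);
     }
   #endif
   }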
code:: cuda + + // tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_commit_multicast( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar, + uint16_t ctaMask); + +tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_commit_multicast( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar, + uint16_t ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_cp.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_cp.rst new file mode 100644 index 00000000000..b0195c5b28e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_cp.rst @@ -0,0 +1,434 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.cp.cta_group::1.128x256b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x256b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.4x256b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.4x256b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x128b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x128b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.32x128b.warpx4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.32x128b.warpx4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.cp.cta_group.4x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.cp.cta_group.128x128b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_fence.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_fence.rst new file mode 100644 index 00000000000..ee287ea8860 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_fence.rst @@ -0,0 +1,18 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.fence::before_thread_sync +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
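tcgen05.cp copies data described by a shared-memory descriptor into tensor memory asynchronously, and tcgen05.commit (documented earlier) is what makes completion of such copies observable through an mbarrier. A sketch under the same assumptions as above (sm_100a/sm_101a, assumed `cta_group_1` tag name), not part of the generated files; building the shared-memory descriptor and handling the mbarrier are left out.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Issue one 128x256b copy from shared memory into tensor memory, then ask
   // the hardware to signal `bar` once the copy has completed. Building the
   // shared-memory descriptor `smem_desc` and initializing/waiting on the
   // mbarrier are assumed to happen elsewhere.
   __device__ void tcgen05_cp_then_commit(std::uint32_t taddr, std::uint64_t smem_desc, std::uint64_t* bar)
   {
   #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
     // tcgen05.cp.cta_group::1.128x256b [taddr], s_desc;
     cuda::ptx::tcgen05_cp_128x256b(cuda::ptx::cta_group_1, taddr, smem_desc);

     // tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [smem_bar];
     cuda::ptx::tcgen05_commit(cuda::ptx::cta_group_1, bar);
   #endif
   }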
code:: cuda + + // tcgen05.fence::before_thread_sync; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void tcgen05_fence_before_thread_sync(); + +tcgen05.fence::after_thread_sync +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.fence::after_thread_sync; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void tcgen05_fence_after_thread_sync(); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_ld.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_ld.rst new file mode 100644 index 00000000000..0bb6bdbb5f5 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_ld.rst @@ -0,0 +1,758 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.ld.sync.aligned.16x64b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[1], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[1], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[1], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[1], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x32bx2.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[1], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[1], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[2], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[2], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[4], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[4], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[8], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[8], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[16], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[16], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[32], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[32], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[64], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[64], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[128], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[128], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma.rst new file mode 100644 index 00000000000..aa5a1675193 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma.rst @@ -0,0 +1,2378 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
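The listings above give the call shape of the ``tcgen05_ld`` wrappers but not a complete usage, so a minimal sketch of the smallest ``16x256b`` overload follows, assuming ``uint32_t`` satisfies the ``B32`` register-type constraint. The kernel name ``demo_tcgen05_ld``, the way ``taddr`` and ``gmem_out`` are obtained, and the omitted architecture guard for SM_100a/SM_101a are illustrative assumptions, not part of the generated API.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Illustrative sketch: each thread of the warp loads one 16x256b.x1
   // fragment from Tensor Memory and spills it to global memory.
   // Compile/dispatch this only for the architectures named in the signature
   // comments above (SM_100a, SM_101a); taddr must already hold a valid
   // Tensor Memory address produced by a prior tcgen05 allocation.
   __global__ void demo_tcgen05_ld(uint32_t taddr, uint32_t* gmem_out)
   {
     uint32_t frag[4]; // matches the B32 (&out)[4] reference parameter

     // tcgen05.ld.sync.aligned.16x256b.x1.b32 frag, [taddr];
     cuda::ptx::tcgen05_ld_16x256b(frag, taddr);

     for (int i = 0; i < 4; ++i)
     {
       gmem_out[threadIdx.x * 4 + i] = frag[i];
     }
   }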
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
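All of the ``tcgen05_mma`` overloads above share the same call shape, so a minimal sketch of the descriptor-based form without ``disable_output_lane``/``scale_input_d`` is given below. The tag-constant names ``cuda::ptx::kind_f16`` and ``cuda::ptx::cta_group_1`` are assumed from the ``.kind``/``.cta_group`` operand spellings and should be checked against the generated header, and the descriptor values are placeholders that real code must build according to the PTX ISA rules for matrix and instruction descriptors.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Illustrative sketch only: d_tmem, a_desc, b_desc and idesc must be valid
   // Tensor Memory / matrix / instruction descriptors built elsewhere, and the
   // two tag-constant names below are assumptions, not verified spellings.
   __device__ void demo_tcgen05_mma(uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc, uint32_t idesc)
   {
     // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d;
     cuda::ptx::tcgen05_mma(
       cuda::ptx::kind_f16,    // assumed name of the .kind::f16 tag constant
       cuda::ptx::cta_group_1, // assumed name of the .cta_group::1 tag constant
       d_tmem,
       a_desc,
       b_desc,
       idesc,
       false); // enable_input_d: do not accumulate into the existing contents of D
   }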
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
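A minimal usage sketch for the block-scaled ``collector::a::fill`` variants listed above, assuming the library's usual dot-variant tag types (``cuda::ptx::kind_mxf4nvf4_t``, ``cuda::ptx::cta_group_1_t``) and treating all descriptors and tensor-memory addresses as placeholders that a real kernel would obtain separately (for example via ``tcgen05.alloc`` and the shared-memory descriptor encoding):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only: issue one block-scaled MMA with .kind::mxf4nvf4,
   // .scale_vec::4X and the collector::a::fill hint. Requires a Blackwell
   // target (sm_100a / sm_101a) and PTX ISA 8.6.
   __device__ void block_scaled_mma_fill(
     uint32_t d_tmem,        // accumulator address in tensor memory
     uint64_t a_desc,        // shared-memory descriptor for A (placeholder)
     uint64_t b_desc,        // shared-memory descriptor for B (placeholder)
     uint32_t idesc,         // instruction descriptor (placeholder)
     uint32_t scale_A_tmem,  // tensor-memory address of the A scale factors
     uint32_t scale_B_tmem)  // tensor-memory address of the B scale factors
   {
     cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_fill(
       cuda::ptx::kind_mxf4nvf4_t{}, // .kind::mxf4nvf4
       cuda::ptx::cta_group_1_t{},   // .cta_group::1
       d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem,
       /* enable_input_d */ false);  // false: do not add the current d_tmem contents
   }
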
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
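The ``collector::a::use`` variants reuse the A operand already staged in the collector buffer by a preceding ``::fill``. A sketch of the ``scale_vec::2X`` form, with the same placeholder operands as above (``.kind::mxf4nvf4`` also satisfies the ``{ mxf4, mxf4nvf4 }`` constraint):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only: reuse the collector-buffered A operand for another MMA.
   __device__ void block_scaled_mma_reuse_a(
     uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc, uint32_t idesc,
     uint32_t scale_A_tmem, uint32_t scale_B_tmem)
   {
     cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use(
       cuda::ptx::kind_mxf4nvf4_t{},
       cuda::ptx::cta_group_1_t{},
       d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem,
       /* enable_input_d */ true); // true: accumulate into the existing d_tmem contents
   }
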
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
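``collector::a::lastuse`` marks the final read of the staged A operand, after which the collector buffer may be reused. A sketch of the ``scale_vec::1X`` / ``kind::mxf8f6f4`` form, with the same placeholder operands as above:

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only: last reuse of the collector-buffered A operand.
   __device__ void block_scaled_mma_last_use_a(
     uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc, uint32_t idesc,
     uint32_t scale_A_tmem, uint32_t scale_B_tmem)
   {
     cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_lastuse(
       cuda::ptx::kind_mxf8f6f4_t{}, // .kind::mxf8f6f4
       cuda::ptx::cta_group_1_t{},   // .cta_group::1
       d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem,
       /* enable_input_d */ true);
   }
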
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
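``collector::a::discard`` asks the tensor core not to retain the A operand in the collector buffer at all. A sketch of the ``scale_vec::1X`` form, again with placeholder operands:

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only: block-scaled MMA without retaining A in the collector buffer.
   __device__ void block_scaled_mma_discard_a(
     uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc, uint32_t idesc,
     uint32_t scale_A_tmem, uint32_t scale_B_tmem)
   {
     cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_discard(
       cuda::ptx::kind_mxf8f6f4_t{}, // .kind::mxf8f6f4
       cuda::ptx::cta_group_1_t{},   // .cta_group::1
       d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem,
       /* enable_input_d */ false);
   }
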
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma_ws.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma_ws.rst new file mode 100644 index 00000000000..cb900a0ec40 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma_ws.rst @@ -0,0 +1,4482 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
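For the weight-stationary ``tcgen05.mma.ws`` wrappers above, both A-from-descriptor and A-from-tensor-memory forms are generated, each with and without a ``zero_column_mask_desc`` operand. A sketch of the ``collector::b0::fill`` form with a zero-column mask; the ``cuda::ptx::kind_f16_t`` tag spelling is an assumption based on the library's naming convention, and all operands are placeholders:

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only: weight-stationary MMA that stages the B operand into
   // collector buffer b0 and applies a zero-column mask descriptor.
   __device__ void ws_mma_fill_b0(
     uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc, uint32_t idesc,
     uint64_t zero_column_mask_desc)
   {
     cuda::ptx::tcgen05_mma_ws_collector_b0_fill(
       cuda::ptx::cta_group_1_t{}, // .cta_group::1 (only group supported by .ws)
       cuda::ptx::kind_f16_t{},    // assumed tag for .kind::f16
       d_tmem, a_desc, b_desc, idesc,
       /* enable_input_d */ false,
       zero_column_mask_desc);
     // The overload without the trailing argument omits zero-column masking.
   }
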
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
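The ``_tmem_a`` overloads take the A operand directly from tensor memory (a ``uint32_t`` address) instead of a shared-memory descriptor. A sketch of ``collector::b0::use`` with A in tensor memory and no zero-column mask, using the same assumed ``kind_f16_t`` tag and placeholder operands as above:

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only: reuse the B operand staged in collector buffer b0,
   // sourcing A from tensor memory.
   __device__ void ws_mma_reuse_b0(
     uint32_t d_tmem, uint32_t a_tmem, uint64_t b_desc, uint32_t idesc)
   {
     cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use(
       cuda::ptx::cta_group_1_t{},
       cuda::ptx::kind_f16_t{},    // assumed tag for .kind::f16
       d_tmem, a_tmem, b_desc, idesc,
       /* enable_input_d */ true); // accumulate into the existing d_tmem contents
   }
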
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
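code:: cuda
+
+   // Added usage sketch -- not part of the generated listing. It exercises the
+   // collector::b1::fill overload above that additionally takes a zero_column_mask_desc,
+   // filling collector buffer b1 for reuse by subsequent weight-stationary MMAs. The
+   // kernel and the tag objects cuda::ptx::cta_group_1 and cuda::ptx::kind_tf32 are
+   // illustrative assumptions (sm_100a / sm_101a, PTX ISA 8.6).
+   #include <cuda/ptx>
+
+   __global__ void mma_ws_b1_fill_sketch(
+     uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc, uint32_t idesc, uint64_t zero_column_mask_desc)
+   {
+     cuda::ptx::tcgen05_mma_ws_collector_b1_fill(
+       cuda::ptx::cta_group_1, cuda::ptx::kind_tf32,
+       d_tmem, a_desc, b_desc, idesc, /*enable_input_d=*/false, zero_column_mask_desc);
+   }
+
+.. 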
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
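code:: cuda
+
+   // Added usage sketch -- not part of the generated listing. It shows the tmem-A variant
+   // of the collector::b1::discard overloads below: the A operand is addressed in tensor
+   // memory (uint32_t a_tmem) instead of being described by a shared-memory descriptor,
+   // and collector buffer b1 is marked as not reused afterwards. The kernel and the tag
+   // objects cuda::ptx::cta_group_1 and cuda::ptx::kind_i8 are illustrative assumptions
+   // (sm_100a / sm_101a, PTX ISA 8.6).
+   #include <cuda/ptx>
+
+   __global__ void mma_ws_b1_discard_sketch(uint32_t d_tmem, uint32_t a_tmem, uint64_t b_desc, uint32_t idesc)
+   {
+     cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard(
+       cuda::ptx::cta_group_1, cuda::ptx::kind_i8,
+       d_tmem, a_tmem, b_desc, idesc, /*enable_input_d=*/true);
+   }
+
+.. 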
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
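The wrappers above map one-to-one onto the PTX instruction and simply forward their arguments. As a minimal illustration (not part of the generated reference), the sketch below calls the ``a_desc``/``b_desc`` form of the ``collector::b1::discard`` variant documented above; the tag constants ``cuda::ptx::cta_group_1`` and ``cuda::ptx::kind_f16`` are assumed to follow the usual ``cuda::ptx`` tag-constant naming, and the descriptors are assumed to have been built elsewhere.

.. code:: cuda

   #include <cstdint>
   #include <cuda/ptx>

   // Illustrative sketch only: forwards pre-built shared-memory descriptors to
   // the collector::b1::discard variant documented above. Intended for devices
   // compiled for sm_100a / sm_101a with PTX ISA 8.6 or newer.
   __device__ void mma_ws_f16_b1_discard(
     uint32_t d_tmem,         // tensor-memory address of the accumulator D
     uint64_t a_desc,         // shared-memory descriptor for operand A
     uint64_t b_desc,         // shared-memory descriptor for operand B
     uint32_t idesc,          // instruction descriptor
     bool     enable_input_d) // whether the current contents of D contribute to the result
   {
     cuda::ptx::tcgen05_mma_ws_collector_b1_discard(
       cuda::ptx::cta_group_1, // assumed tag constant for .cta_group::1
       cuda::ptx::kind_f16,    // assumed tag constant for .kind::f16
       d_tmem, a_desc, b_desc, idesc, enable_input_d);
   }

The remaining ``kind`` and ``collector`` variants in this section take the same argument list.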
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
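For reference, the ``_tmem_a_`` overloads differ only in taking the A operand as a tensor-memory address (``uint32_t a_tmem``) instead of a shared-memory descriptor. A minimal sketch under the same assumptions as the earlier example, here using the ``collector::b2::use`` form:

.. code:: cuda

   #include <cstdint>
   #include <cuda/ptx>

   // Illustrative sketch only: same call shape as before, but operand A is
   // read from tensor memory (a_tmem) rather than through a shared-memory
   // descriptor; collector::b2::use is selected for the B operand.
   __device__ void mma_ws_tf32_b2_use(
     uint32_t d_tmem,   // tensor-memory address of the accumulator D
     uint32_t a_tmem,   // tensor-memory address of operand A
     uint64_t b_desc,   // shared-memory descriptor for operand B
     uint32_t idesc,    // instruction descriptor
     bool     enable_input_d)
   {
     cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use(
       cuda::ptx::cta_group_1, // assumed tag constant for .cta_group::1
       cuda::ptx::kind_tf32,   // assumed tag constant for .kind::tf32
       d_tmem, a_tmem, b_desc, idesc, enable_input_d);
   }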
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
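Each variant is also provided as an overload with a trailing ``zero_column_mask_desc`` argument, as shown in the signatures above. A hedged sketch (same assumptions about tag-constant names) using the ``collector::b2::lastuse`` form:

.. code:: cuda

   #include <cstdint>
   #include <cuda/ptx>

   // Illustrative sketch only: the overload that additionally takes a
   // zero-column mask descriptor, here for collector::b2::lastuse.
   __device__ void mma_ws_i8_b2_lastuse_masked(
     uint32_t d_tmem,
     uint64_t a_desc,
     uint64_t b_desc,
     uint32_t idesc,
     bool     enable_input_d,
     uint64_t zero_column_mask_desc) // zero-column mask descriptor
   {
     cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse(
       cuda::ptx::cta_group_1, // assumed tag constant for .cta_group::1
       cuda::ptx::kind_i8,     // assumed tag constant for .kind::i8
       d_tmem, a_desc, b_desc, idesc, enable_input_d,
       zero_column_mask_desc);
   }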
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
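The ``collector::b3`` variants follow the same pattern. The sketch below (illustrative only, with the same assumptions about tag-constant names) combines the tensor-memory A operand with the ``zero_column_mask_desc`` overload of ``collector::b3::fill``:

.. code:: cuda

   #include <cstdint>
   #include <cuda/ptx>

   // Illustrative sketch only: tensor-memory A operand combined with the
   // zero_column_mask_desc overload of collector::b3::fill.
   __device__ void mma_ws_f8f6f4_b3_fill_masked(
     uint32_t d_tmem,                // tensor-memory address of the accumulator D
     uint32_t a_tmem,                // tensor-memory address of operand A
     uint64_t b_desc,                // shared-memory descriptor for operand B
     uint32_t idesc,                 // instruction descriptor
     bool     enable_input_d,
     uint64_t zero_column_mask_desc) // zero-column mask descriptor
   {
     cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill(
       cuda::ptx::cta_group_1,  // assumed tag constant for .cta_group::1
       cuda::ptx::kind_f8f6f4,  // assumed tag constant for .kind::f8f6f4
       d_tmem, a_tmem, b_desc, idesc, enable_input_d,
       zero_column_mask_desc);
   }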
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_shift.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_shift.rst new file mode 100644 index 00000000000..54e665ed3cc --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_shift.rst @@ -0,0 +1,24 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.shift.cta_group::1.down +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.shift.cta_group.down [taddr]; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_shift_down( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr); + +tcgen05.shift.cta_group::2.down +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.shift.cta_group.down [taddr]; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_shift_down( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_st.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_st.rst new file mode 100644 index 00000000000..3147a1757d8 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_st.rst @@ -0,0 +1,758 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.st.sync.aligned.16x64b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
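code:: cuda

   // Editorial usage sketch; not part of the auto-generated listing above.
   // A minimal illustration of the tcgen05.mma.ws collector-buffer (::b3)
   // wrappers documented in tcgen05_mma_ws.rst, assuming the library's usual
   // tag-value constants (cuda::ptx::cta_group_1, cuda::ptx::kind_f16) and the
   // __cccl_ptx_isa feature macro. The tensor-memory addresses (d_tmem, a_tmem)
   // and the B-matrix/instruction descriptors are assumed to have been set up
   // elsewhere, and the instructions require compiling for sm_100a or sm_101a.
   #include <cuda/ptx>
   #include <cstdint>

   __global__ void mma_ws_b3_example(uint32_t d_tmem, uint32_t a_tmem, uint64_t b_desc, uint32_t idesc)
   {
   #if __cccl_ptx_isa >= 860
     // First MMA of a sequence: fill collector buffer b3 while performing the MMA.
     cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill(
       cuda::ptx::cta_group_1, cuda::ptx::kind_f16, d_tmem, a_tmem, b_desc, idesc,
       /* enable_input_d */ true);
     // Subsequent MMAs reuse the buffered operand without re-reading it...
     cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use(
       cuda::ptx::cta_group_1, cuda::ptx::kind_f16, d_tmem, a_tmem, b_desc, idesc, true);
     // ...and the final one marks buffer b3 as read for the last time.
     cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse(
       cuda::ptx::cta_group_1, cuda::ptx::kind_f16, d_tmem, a_tmem, b_desc, idesc, true);
   #endif
   }

..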
code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.16x64b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x64b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x64b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x64b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x64b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x64b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x64b.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x128.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x128b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x128b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x128b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x128b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x128b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x128b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x128b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x256b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x256b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x256b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x256b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x256b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x256b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.32x32b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.32x32b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.32x32b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.32x32b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.32x32b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.32x32b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.32x32b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.32x32b.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x128.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x32bx2.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x1.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.16x32bx2.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x2.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x32bx2.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x4.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x32bx2.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x8.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x32bx2.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x16.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x32bx2.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x32.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x32bx2.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x64.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x32bx2.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x128.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[128]); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_wait.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_wait.rst new file mode 100644 index 00000000000..ec48818eecc --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_wait.rst @@ -0,0 +1,18 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.wait::ld.sync.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.wait::ld.sync.aligned; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void tcgen05_wait_ld(); + +tcgen05.wait::st.sync.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.wait::st.sync.aligned; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void tcgen05_wait_st(); diff --git a/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst index a8c4a260782..fbf010d6009 100644 --- a/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst @@ -5,9 +5,9 @@ tensormap.replace.tile.global_address.global.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_global_address( cuda::ptx::space_global_t, void* tm_addr, @@ -17,9 +17,9 @@ tensormap.replace.tile.global_address.shared::cta.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
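code:: cuda

   // Editorial usage sketch; not part of the auto-generated listing above.
   // A minimal illustration of the tcgen05.st and tcgen05.wait::st wrappers
   // from the two files added above, guarded on the library's __cccl_ptx_isa
   // macro. `taddr` is assumed to be a valid tensor-memory address obtained
   // elsewhere (e.g. from a prior tcgen05.alloc), and the code must be
   // compiled for sm_100a or sm_101a.
   #include <cuda/ptx>
   #include <cstdint>

   __global__ void st_then_wait_example(uint32_t taddr)
   {
   #if __cccl_ptx_isa >= 860
     // Each thread contributes four 32-bit values; the array extent selects
     // the .x4 form of tcgen05.st.sync.aligned.32x32b.
     uint32_t values[4] = {0u, 1u, 2u, 3u};
     cuda::ptx::tcgen05_st_32x32b(taddr, values);
     // Block until the preceding tcgen05.st operations have completed.
     cuda::ptx::tcgen05_wait_st();
   #endif
   }

..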
code:: cuda - // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_global_address( cuda::ptx::space_shared_t, void* tm_addr, @@ -29,9 +29,9 @@ tensormap.replace.tile.rank.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_rank( cuda::ptx::space_global_t, void* tm_addr, @@ -41,9 +41,9 @@ tensormap.replace.tile.rank.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_rank( cuda::ptx::space_shared_t, void* tm_addr, @@ -53,9 +53,9 @@ tensormap.replace.tile.box_dim.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_box_dim( cuda::ptx::space_global_t, void* tm_addr, @@ -66,9 +66,9 @@ tensormap.replace.tile.box_dim.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_box_dim( cuda::ptx::space_shared_t, void* tm_addr, @@ -79,9 +79,9 @@ tensormap.replace.tile.global_dim.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_global_dim( cuda::ptx::space_global_t, void* tm_addr, @@ -92,9 +92,9 @@ tensormap.replace.tile.global_dim.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_global_dim( cuda::ptx::space_shared_t, void* tm_addr, @@ -105,9 +105,9 @@ tensormap.replace.tile.global_stride.global.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda - // tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_global_stride( cuda::ptx::space_global_t, void* tm_addr, @@ -118,9 +118,9 @@ tensormap.replace.tile.global_stride.shared::cta.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_global_stride( cuda::ptx::space_shared_t, void* tm_addr, @@ -131,9 +131,35 @@ tensormap.replace.tile.element_stride.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> + __device__ static inline void tensormap_replace_element_stride( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +tensormap.replace.tile.element_stride.shared::cta.b1024.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a + // .space = { .shared::cta } + template = true> + __device__ static inline void tensormap_replace_element_stride( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +tensormap.replace.tile.element_stride.global.b1024.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a + // .space = { .global } + template = true> __device__ static inline void tensormap_replace_element_size( cuda::ptx::space_global_t, void* tm_addr, @@ -144,9 +170,9 @@ tensormap.replace.tile.element_stride.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_element_size( cuda::ptx::space_shared_t, void* tm_addr, @@ -157,7 +183,7 @@ tensormap.replace.tile.elemtype.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_elemtype( @@ -169,7 +195,7 @@ tensormap.replace.tile.elemtype.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda - // tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_elemtype( @@ -181,7 +207,7 @@ tensormap.replace.tile.interleave_layout.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_interleave_layout( @@ -193,7 +219,7 @@ tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_interleave_layout( @@ -205,7 +231,7 @@ tensormap.replace.tile.swizzle_mode.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_swizzle_mode( @@ -217,7 +243,7 @@ tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_swizzle_mode( @@ -229,7 +255,7 @@ tensormap.replace.tile.fill_mode.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_fill_mode( @@ -241,10 +267,34 @@ tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_fill_mode( cuda::ptx::space_shared_t, void* tm_addr, cuda::ptx::n32_t new_val); + +tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tensormap.replace.tile.swizzle_atomicity.space.b1024.b32 [tm_addr], new_val; // PTX ISA 86, SM_100a, SM_101a + // .space = { .global } + template + __device__ static inline void tensormap_replace_swizzle_atomicity( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tensormap.replace.tile.swizzle_atomicity.space.b1024.b32 [tm_addr], new_val; // PTX ISA 86, SM_100a, SM_101a + // .space = { .shared::cta } + template + __device__ static inline void tensormap_replace_swizzle_atomicity( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); diff --git a/docs/libcudacxx/ptx/instructions/special_registers.rst b/docs/libcudacxx/ptx/instructions/special_registers.rst index 1e9597fa726..1981f7fb908 100644 --- a/docs/libcudacxx/ptx/instructions/special_registers.rst +++ b/docs/libcudacxx/ptx/instructions/special_registers.rst @@ -6,4 +6,4 @@ Special registers - PTX ISA: `Special Register `__ -.. include:: generated/special_registers.rst +.. include:: generated/get_sreg.rst diff --git a/docs/repo.toml b/docs/repo.toml index 7ff29fd6eba..08ce4e58775 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -54,7 +54,7 @@ api_output_directory = "api" use_fast_doxygen_conversion = true sphinx_generate_doxygen_groups = true sphinx_generate_doxygen_pages = true -sphinx_exclude_patterns = [] +sphinx_exclude_patterns = ['ptx/instructions/generated'] [repo_docs.projects.cub] name = "CUB" diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h index c8ce41c0a20..75a72db7024 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h @@ -14,12 +14,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_ template _CCCL_DEVICE static inline void barrier_cluster_arrive() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive;" : : : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 780 @@ -34,12 +34,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_be template _CCCL_DEVICE static inline void barrier_cluster_wait() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.wait;" : : : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.wait;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 780 @@ -56,13 +56,13 @@ extern "C" _CCCL_DEVICE void 
__cuda_ptx_barrier_cluster_arrive_is_not_supported_ template _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) { - // __sem == sem_release (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive.release;" : : : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.release;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -79,13 +79,13 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_ template _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) { - // __sem == sem_relaxed (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive.relaxed;" : : :);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +// __sem == sem_relaxed (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.relaxed;" : : :); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -102,13 +102,13 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_be template _CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) { - // __sem == sem_acquire (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.wait.acquire;" : : : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); +// __sem == sem_acquire (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.wait.acquire;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h new file mode 100644 index 00000000000..80fe3796e69 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h @@ -0,0 +1,130 @@ +// This file was automatically generated. Do not edit. 
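+//
+// [Editorial note, not produced by the generator] Illustrative usage sketch for the
+// cluster-barrier wrappers shown in this patch. It assumes a kernel launched with a
+// cluster dimension; the kernel name is a placeholder:
+//
+//   #include <cuda/ptx>
+//
+//   __global__ void kernel()
+//   {
+//     // Every block in the cluster signals arrival, then waits for the others.
+//     cuda::ptx::barrier_cluster_arrive(); // or barrier_cluster_arrive(cuda::ptx::sem_release)
+//     cuda::ptx::barrier_cluster_wait();   // or barrier_cluster_wait(cuda::ptx::sem_acquire)
+//   }
+//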
+ +#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ +#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ + +/* +// barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90 +// .aligned = { .aligned } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(dot_aligned_t) +{ +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 780 + +/* +// barrier.cluster.wait.aligned; // PTX ISA 78, SM_90 +// .aligned = { .aligned } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_wait( + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_wait(dot_aligned_t) +{ +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.wait.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 780 + +/* +// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 +// .sem = { .release } +// .aligned = { .aligned } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t, dot_aligned_t) +{ +// __sem == sem_release (due to parameter type constraint) +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.release.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 +// .sem = { .relaxed } +// .aligned = { .aligned } +// Marked volatile +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_relaxed_t, + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t, dot_aligned_t) +{ +// __sem == sem_relaxed (due to parameter type constraint) +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.relaxed.aligned;" : : :); +# else + // Unsupported architectures will have a linker 
error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90 +// .sem = { .acquire } +// .aligned = { .aligned } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t, dot_aligned_t) +{ +// __sem == sem_acquire (due to parameter type constraint) +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.wait.acquire.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h b/libcudacxx/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h new file mode 100644 index 00000000000..19e3f92bd13 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h @@ -0,0 +1,240 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CLUSTERLAUNCHCONTROL_H_ +#define _CUDA_PTX_GENERATED_CLUSTERLAUNCHCONTROL_H_ + +/* +// clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [addr], [smem_bar]; // PTX ISA +86, SM_100 template +__device__ static inline void clusterlaunchcontrol_try_cancel( + void* addr, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_clusterlaunchcontrol_try_cancel_is_not_supported_before_SM_100__(); +template +_CCCL_DEVICE static inline void clusterlaunchcontrol_try_cancel(void* __addr, _CUDA_VSTD::uint64_t* __smem_bar) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + asm("clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [%0], [%1];" + : + : "r"(__as_ptr_smem(__addr)), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_try_cancel_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 [addr], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a template +__device__ static inline void clusterlaunchcontrol_try_cancel_multicast( + void* addr, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_clusterlaunchcontrol_try_cancel_multicast_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void clusterlaunchcontrol_try_cancel_multicast(void* __addr, _CUDA_VSTD::uint64_t* __smem_bar) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 " + "[%0], [%1];" + : + : "r"(__as_ptr_smem(__addr)), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); 
+# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_try_cancel_multicast_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 pred_is_canceled, try_cancel_response; // PTX ISA 86, SM_100 +template = true> +__device__ static inline bool clusterlaunchcontrol_query_cancel_is_canceled( + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_clusterlaunchcontrol_query_cancel_is_canceled_is_not_supported_before_SM_100__(); +template = true> +_CCCL_DEVICE static inline bool clusterlaunchcontrol_query_cancel_is_canceled(_B128 __try_cancel_response) +{ + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + _CUDA_VSTD::uint32_t __pred_is_canceled; + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%1, %2}; \n" + "{\n\t .reg .pred P_OUT; \n\t" + "clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 P_OUT, B128_try_cancel_response;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}\n\t" + "}" + : "=r"(__pred_is_canceled) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); + return static_cast(__pred_is_canceled); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_is_canceled_is_not_supported_before_SM_100__(); + return false; +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 +template = true, typename B128, enable_if_t += true> +__device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_x( + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_x_is_not_supported_before_SM_100__(); +template = true, + typename _B128, + _CUDA_VSTD::enable_if_t = true> +_CCCL_DEVICE static inline _B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_x(_B128 __try_cancel_response) +{ + static_assert(sizeof(_B32) == 4, ""); + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + _CUDA_VSTD::uint32_t __ret_dim; + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%1, %2}; \n" + "clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 %0, B128_try_cancel_response;\n\t" + "}" + : "=r"(__ret_dim) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); + return *reinterpret_cast<_B32*>(&__ret_dim); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_x_is_not_supported_before_SM_100__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 +template = true, typename B128, enable_if_t += true> +__device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_y( + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
+__cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_y_is_not_supported_before_SM_100__(); +template = true, + typename _B128, + _CUDA_VSTD::enable_if_t = true> +_CCCL_DEVICE static inline _B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_y(_B128 __try_cancel_response) +{ + static_assert(sizeof(_B32) == 4, ""); + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + _CUDA_VSTD::uint32_t __ret_dim; + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%1, %2}; \n" + "clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 %0, B128_try_cancel_response;\n\t" + "}" + : "=r"(__ret_dim) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); + return *reinterpret_cast<_B32*>(&__ret_dim); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_y_is_not_supported_before_SM_100__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 +template = true, typename B128, enable_if_t += true> +__device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_z( + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_z_is_not_supported_before_SM_100__(); +template = true, + typename _B128, + _CUDA_VSTD::enable_if_t = true> +_CCCL_DEVICE static inline _B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_z(_B128 __try_cancel_response) +{ + static_assert(sizeof(_B32) == 4, ""); + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + _CUDA_VSTD::uint32_t __ret_dim; + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%1, %2}; \n" + "clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 %0, B128_try_cancel_response;\n\t" + "}" + : "=r"(__ret_dim) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); + return *reinterpret_cast<_B32*>(&__ret_dim); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_z_is_not_supported_before_SM_100__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 block_dim, try_cancel_response; // PTX ISA 86, SM_100 +template = true, typename B128, enable_if_t += true> +__device__ static inline void clusterlaunchcontrol_query_cancel_get_first_ctaid( + B32 (&block_dim)[4], + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_is_not_supported_before_SM_100__(); +template = true, + typename _B128, + _CUDA_VSTD::enable_if_t = true> +_CCCL_DEVICE static inline void +clusterlaunchcontrol_query_cancel_get_first_ctaid(_B32 (&__block_dim)[4], _B128 __try_cancel_response) +{ + static_assert(sizeof(_B32) == 4, ""); + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 
1000 + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%4, %5}; \n" + "clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 {%0, %1, %2, %3}, B128_try_cancel_response;\n\t" + "}" + : "=r"(__block_dim[0]), "=r"(__block_dim[1]), "=r"(__block_dim[2]), "=r"(__block_dim[3]) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_CLUSTERLAUNCHCONTROL_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h index d2196402e7a..a9aa3534611 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h @@ -4,8 +4,7 @@ #define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ /* -// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, -SM_90 +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -28,23 +27,60 @@ _CCCL_DEVICE static inline void cp_async_bulk( const _CUDA_VSTD::uint32_t& __size, _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // " - "1a. unicast" : : "r"(__as_ptr_smem(__dstMem)), - "l"(__as_ptr_gmem(__srcMem)), - "r"(__size), - "r"(__as_ptr_smem(__smem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
PTX ISA 80, SM_90 +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } template @@ -67,23 +103,25 @@ _CCCL_DEVICE static inline void cp_async_bulk( const _CUDA_VSTD::uint32_t& __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. " : : "r"( - __as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -100,17 +138,56 @@ template _CCCL_DEVICE static inline void cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. 
" : : "l"(__as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// cp.async.bulk.dst.src.bulk_group.cp_mask [dstMem], [srcMem], size, byteMask; // PTX ISA 86, SM_100 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_cp_mask( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + const uint16_t& byteMask); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_cp_mask_is_not_supported_before_SM_100__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_cp_mask( + space_global_t, + space_shared_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + const _CUDA_VSTD::uint16_t& __byteMask) +{ +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + asm("cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%0], [%1], %2, %3;" + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size), "h"(__byteMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_cp_mask_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h index 3c32743e977..3b906fd6922 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h @@ -13,12 +13,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_suppor template _CCCL_DEVICE static inline void cp_async_bulk_commit_group() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.commit_group;" : : :);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("cp.async.bulk.commit_group;" : : :); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h index f54bf8bbdeb..7ac386343b9 100644 --- 
a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h @@ -4,8 +4,8 @@ #define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ /* -// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], -ctaMask; // 1. PTX ISA 80, SM_90a +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; +// PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -19,7 +19,7 @@ __device__ static inline void cp_async_bulk( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk( space_cluster_t, @@ -30,19 +30,22 @@ _CCCL_DEVICE static inline void cp_async_bulk( _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], " - "%4; // 1. " : : "r"(__as_ptr_smem(__dstMem)), - "l"(__as_ptr_gmem(__srcMem)), - "r"(__size), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], %4;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__as_ptr_gmem(__srcMem)), + "r"(__size), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h index f7c60bb72f6..2326346f547 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h @@ -4,8 +4,8 @@ #define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ /* -// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1a. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -28,23 +28,116 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[1], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];// " - "1a." : : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__as_ptr_smem(__smem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + 
void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2}], " + "[%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2}], " + "[%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -65,22 +158,23 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[1], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%0, {%1}], [%2]; // 3a." : : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__as_ptr_smem(__srcMem)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%0, {%1}], [%2];" + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1b. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -103,24 +197,132 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[2], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], " - "[%4];// 1b." : : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__smem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], [%4];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], [%4];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void 
cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, " + "%3}], [%4];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, " + "%3}], [%4];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -141,23 +343,23 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[2], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%1, %2}], [%3]; // 3b." : : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__srcMem)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%1, %2}], [%3];" + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1c. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -180,25 +382,136 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[3], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], " - "[%5];// 1c." : : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__smem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], [%5];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], [%5];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, " + "%4}], [%5];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, " + "%4}], [%5];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with 
a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -219,25 +532,27 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[3], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 3c." : : "l"( - __tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3}], [%4];" + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1d. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -260,26 +575,141 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[4], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, " - "%5}], [%6];// 1d." 
: : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__smem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5}], " + "[%6];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5}], [%6];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const 
void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, " + "%4, %5}], [%6];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, " + "%4, %5}], [%6];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -300,26 +730,28 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[4], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 3d." : : "l"( - __tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5];" + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1e. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -342,27 +774,146 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[5], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, " - "%6}], [%7];// 1e." : : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__smem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, " + "%6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, %6}], " + "[%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, " + "%4, %5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, " + "%4, %5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + 
"r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -383,21 +934,23 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[5], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 3e." : : "l"( - __tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6];" + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h new file mode 100644 index 00000000000..f376f1b48c3 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h @@ -0,0 +1,288 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_GATHER_SCATTER_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_GATHER_SCATTER_H_ + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, " + "%5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, " + "{%2, %3, %4, %5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + 
asm("cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, " + "{%2, %3, %4, %5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster " + "[%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], +[tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == 
space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster." + "cta_group::1 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster." + "cta_group::2 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, +SM_100a, SM_101a +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor_tile_scatter4( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_scatter4_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_scatter4( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + const void* __srcMem) +{ +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6];" + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_scatter4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_GATHER_SCATTER_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h index 56c199d39ff..b0d845b92a0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h @@ -5,7 +5,7 @@ /* // 
cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -19,7 +19,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -29,26 +29,95 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[1], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2}], [%3], %4;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2}], [%3], %4; // 2a." 
: : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2}], [%3], %4;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2}], [%3], %4;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -62,7 +131,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -72,27 +141,98 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[2], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2, %3}], [%4], %5;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + 
cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3}], [%4], %5; // 2b." : : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2, %3}], [%4], %5;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2, %3}], [%4], %5;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2c. 
PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -106,7 +246,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -116,28 +256,101 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[3], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2, %3, %4}], [%5], %6;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4}], [%5], %6; // 2c." 
: : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2, %3, %4}], [%5], %6;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2, %3, %4}], [%5], %6;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -151,7 +364,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -161,29 +374,104 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[4], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2, %3, %4, %5}], [%6], %7;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], 
[smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4, %5}], [%6], %7; // 2d." : : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2, %3, %4, %5}], [%6], %7;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2, %3, %4, %5}], [%6], %7;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2e. 
PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -197,7 +485,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -207,24 +495,101 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[5], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4, %5, %6}], [%7], %8; // 2e." 
: : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h index 85b1507f721..b0373a3e6a7 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h @@ -14,12 +14,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supporte template _CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __N) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.wait_group %0;" : : "n"(__N.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("cp.async.bulk.wait_group %0;" : : "n"(__N.value) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -34,12 +34,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_sup template _CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __N) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.wait_group.read %0;" : : "n"(__N.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm 
volatile("cp.async.bulk.wait_group.read %0;" : : "n"(__N.value) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h new file mode 100644 index 00000000000..b2bf07247c1 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_H_ + +/* +// cp.async.mbarrier.arrive.b64 [addr]; // PTX ISA 70, SM_80 +template +__device__ static inline void cp_async_mbarrier_arrive( + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_mbarrier_arrive_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline void cp_async_mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + asm("cp.async.mbarrier.arrive.b64 [%0];" : : "r"(__as_ptr_smem(__addr)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_mbarrier_arrive_is_not_supported_before_SM_80__(); +# endif +} +#endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h new file mode 100644 index 00000000000..816a3fc63b9 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_NOINC_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_NOINC_H_ + +/* +// cp.async.mbarrier.arrive.noinc.b64 [addr]; // PTX ISA 70, SM_80 +template +__device__ static inline void cp_async_mbarrier_arrive_noinc( + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_mbarrier_arrive_noinc_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline void cp_async_mbarrier_arrive_noinc(_CUDA_VSTD::uint64_t* __addr) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + asm("cp.async.mbarrier.arrive.noinc.b64 [%0];" : : "r"(__as_ptr_smem(__addr)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_mbarrier_arrive_noinc_is_not_supported_before_SM_80__(); +# endif +} +#endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_NOINC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h index 9b1bf35b290..499fda57c91 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h @@ -10,7 +10,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .b32 } // .op = { .and } -template +template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -22,7 +22,7 @@ __device__ static inline void cp_reduce_async_bulk( */ #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_cluster_t, space_shared_t, @@ -32,20 +32,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_and_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [%0], [%1], %2, [%3]; " - "// 1." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_b32 (due to parameter type constraint) +// __op == op_and_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [%0], [%1], %2, [%3]; // " + "1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -56,7 +59,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .b32 } // .op = { .or } -template +template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -68,7 +71,7 @@ __device__ static inline void cp_reduce_async_bulk( */ #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_cluster_t, space_shared_t, @@ -78,20 +81,22 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_or_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [%0], [%1], %2, [%3]; " - "// 1." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_b32 (due to parameter type constraint) +// __op == op_or_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [%0], [%1], %2, [%3]; // 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -102,7 +107,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .b32 } // .op = { .xor } -template +template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -114,7 +119,7 @@ __device__ static inline void cp_reduce_async_bulk( */ #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_cluster_t, space_shared_t, @@ -124,20 +129,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_xor_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [%0], [%1], %2, [%3]; " - "// 1." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_b32 (due to parameter type constraint) +// __op == op_xor_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -170,20 +178,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [%0], [%1], %2, [%3]; " - "// 1." 
: : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -216,20 +227,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [%0], [%1], %2, [%3]; " - "// 1." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -262,20 +276,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [%0], [%1], %2, [%3]; " - "// 1." 
: : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -308,20 +325,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [%0], [%1], %2, [%3]; " - "// 1." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_inc (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -354,20 +374,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [%0], [%1], %2, [%3]; " - "// 1." 
: : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_dec (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -400,20 +423,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [%0], [%1], %2, [%3]; " - "// 1." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -446,20 +472,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [%0], [%1], %2, [%3]; " - "// 1." 
: : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -492,20 +521,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [%0], [%1], %2, [%3]; " - "// 1." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -538,20 +570,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " - "// 1." 
: : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -584,20 +619,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " - "// 2." : : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; // " + "2." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -625,24 +663,26 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // __space == space_global (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_and_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __op == op_and_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -670,24 +710,26 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // __space == space_global (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_or_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __op == op_or_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -715,24 +757,26 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // __space == space_global (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_xor_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __op == op_xor_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -762,19 +806,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -804,19 +848,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -846,19 +890,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -888,19 +932,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_inc (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -930,19 +974,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_dec (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -972,19 +1016,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1014,19 +1058,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1056,19 +1100,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1098,19 +1142,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u64 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1140,19 +1184,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u64 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1182,19 +1226,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1224,19 +1268,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s64 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1266,19 +1310,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s64 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1303,19 +1347,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_add_t, float* __dstMem, const float* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1340,19 +1384,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_add_t, double* __dstMem, const double* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1382,19 +1426,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 6." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 6." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h index da5cdb6bc9b..5c177976468 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h @@ -29,19 +29,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const __nv_bfloat16* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_bf16 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -71,19 +71,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const __nv_bfloat16* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [%0], [%1], %2; // 4." 
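// --- Usage sketch (illustrative only; not part of the generated headers or this patch) ---
// The shared::cta -> global cp_reduce_async_bulk overloads converted above (and the
// bf16/f16 ones that follow) all share one call shape. A minimal sketch for the u32
// add variant, assuming <cuda/ptx> is included, the code runs on SM_90 or newer, and
// `dst`/`smem` satisfy the alignment requirements of cp.reduce.async.bulk; kernel and
// variable names are hypothetical.

#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void bulk_add_into_global(cuda::std::uint32_t* dst)
{
  __shared__ alignas(16) cuda::std::uint32_t smem[256];
  // ... block cooperatively fills smem (elided) ...
  __syncthreads();
  if (threadIdx.x == 0)
  {
    // Lowers to: cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32
    cuda::ptx::cp_reduce_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_add,
      dst, smem, sizeof(smem)); // size is in bytes
    // A cp_async_bulk_commit_group() / cp_async_bulk_wait_group_read() pair would
    // normally follow before smem is reused (elided here).
  }
}
// --- End usage sketch ---------------------------------------------------------------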
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_bf16 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -113,19 +113,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const __nv_bfloat16* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [%0], [%1], %2; // 5." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_bf16 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [%0], [%1], %2; // 5." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h index 3d9d4520dcb..95d775d09e2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h @@ -24,19 +24,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_min_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [%0], [%1], %2; // 4." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f16 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -61,19 +61,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_max_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [%0], [%1], %2; // 4." : : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f16 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -98,19 +98,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_add_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [%0], [%1], %2; // 5." 
: : "l"( - __as_ptr_gmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f16 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [%0], [%1], %2; // 5." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h index 9ec5b2443d8..540b0e95ed5 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h @@ -34,53 +34,67 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." 
- : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -115,53 +129,67 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
- : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
+ : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -196,85 +224,99 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -309,93 +351,107 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -430,109 +486,115 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " - "1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h b/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h new file mode 100644 index 00000000000..e8691178f14 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h @@ -0,0 +1,36 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_ELECT_SYNC_H_ +#define _CUDA_PTX_GENERATED_ELECT_SYNC_H_ + +/* +// elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90 +template +__device__ static inline bool elect_sync( + const uint32_t& membermask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_elect_sync_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool elect_sync(const _CUDA_VSTD::uint32_t& __membermask) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __is_elected; + asm volatile( + "{\n\t .reg .pred P_OUT; \n\t" + "elect.sync _|P_OUT, %1;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__is_elected) + : "r"(__membermask) + :); + return static_cast(__is_elected); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_elect_sync_is_not_supported_before_SM_90__(); + return false; +# endif +} +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_ELECT_SYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h index db00c4d4cba..c0bd9e9a3d2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h @@ -4,71 +4,205 @@ #define _CUDA_PTX_GENERATED_FENCE_H_ /* -// fence{.sem}.scope; // 1. PTX ISA 60, SM_70 -// .sem = { .sc, .acq_rel } +// fence.sem.scope; // 1. PTX ISA 60, SM_70 +// .sem = { .sc } // .scope = { .cta, .gpu, .sys } -template +template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_t scope); */ #if __cccl_ptx_isa >= 600 extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_70__(); -template -_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope) +template +_CCCL_DEVICE static inline void fence(sem_sc_t, scope_t<_Scope> __scope) { - static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); + // __sem == sem_sc (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_70, - ( - _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { - asm volatile("fence.sc.cta; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { - asm volatile("fence.sc.gpu; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { - asm volatile("fence.sc.sys; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { - asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { - asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { - asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_is_not_supported_before_SM_70__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.sc.cta; // 1." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.sc.gpu; // 1." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.sc.sys; // 1." 
: : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_70__(); +# endif } #endif // __cccl_ptx_isa >= 600 /* -// fence{.sem}.scope; // 2. PTX ISA 78, SM_90 -// .sem = { .sc, .acq_rel } +// fence.sem.scope; // 2. PTX ISA 78, SM_90 +// .sem = { .sc } // .scope = { .cluster } -template +template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_cluster_t); */ #if __cccl_ptx_isa >= 780 extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) +template +_CCCL_DEVICE static inline void fence(sem_sc_t, scope_cluster_t) { - static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); - // __scope == scope_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__sem == sem_sc) { - asm volatile("fence.sc.cluster; // 2." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel) { - asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_is_not_supported_before_SM_90__();)); +// __sem == sem_sc (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.sc.cluster; // 2." : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 780 +/* +// fence.sem.scope; // 1. PTX ISA 60, SM_70 +// .sem = { .acq_rel } +// .scope = { .cta, .gpu, .sys } +template +__device__ static inline void fence( + cuda::ptx::sem_acq_rel_t, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 600 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_70__(); +template +_CCCL_DEVICE static inline void fence(sem_acq_rel_t, scope_t<_Scope> __scope) +{ + // __sem == sem_acq_rel (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_gpu || __scope == scope_sys, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_70__(); +# endif +} +#endif // __cccl_ptx_isa >= 600 + +/* +// fence.sem.scope; // 2. 
PTX ISA 78, SM_90 +// .sem = { .acq_rel } +// .scope = { .cluster } +template +__device__ static inline void fence( + cuda::ptx::sem_acq_rel_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence(sem_acq_rel_t, scope_cluster_t) +{ +// __sem == sem_acq_rel (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 780 + +/* +// fence.sem.scope; // PTX ISA 86, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void fence( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence(sem_acquire_t, scope_t<_Scope> __scope) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.acquire.cta;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile("fence.acquire.cluster;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.acquire.gpu;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.acquire.sys;" : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// fence.sem.scope; // PTX ISA 86, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence(sem_release_t, scope_t<_Scope> __scope) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.release.cta;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile("fence.release.cluster;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.release.gpu;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.release.sys;" : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_FENCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h 
b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h index f8c4e6cf476..6b0c8ec161d 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h @@ -17,14 +17,14 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_fence_mbarrier_init_is_not_supported_bef template _CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster_t) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("fence.mbarrier_init.release.cluster; // 3." : : : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.mbarrier_init.release.cluster; // 3." : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h index cc413a0f511..e520d99bfaa 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h @@ -13,12 +13,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_alias_is_not_supported_befor template _CCCL_DEVICE static inline void fence_proxy_alias() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("fence.proxy.alias; // 4." : : : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700 + asm volatile("fence.proxy.alias; // 4." : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__(); +# endif } #endif // __cccl_ptx_isa >= 750 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h index 176d24ff73f..f8ee49909db 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h @@ -13,17 +13,17 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_befor template _CCCL_DEVICE static inline void fence_proxy_async() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("fence.proxy.async; // 5." : : : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.proxy.async; // 5." 
: : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 +// fence.proxy.async.space; // 6. PTX ISA 80, SM_90 // .space = { .global, .shared::cluster, .shared::cta } template __device__ static inline void fence_proxy_async( @@ -35,19 +35,23 @@ template _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) { static_assert(__space == space_global || __space == space_cluster || __space == space_shared, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__space == space_global) { - asm volatile("fence.proxy.async.global; // 6." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__space == space_cluster) { - asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__space == space_shared) { - asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__space == space_global) + { + asm volatile("fence.proxy.async.global; // 6." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__space == space_cluster) + { + asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__space == space_shared) + { + asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h new file mode 100644 index 00000000000..93c66063ea3 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h @@ -0,0 +1,62 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_GENERIC_SYNC_RESTRICT_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_GENERIC_SYNC_RESTRICT_H_ + +/* +// fence.proxy.async::generic.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 +// .sem = { .acquire } +// .space = { .shared::cluster } +// .scope = { .cluster } +template +__device__ static inline void fence_proxy_async_generic_sync_restrict( + cuda::ptx::sem_acquire_t, + cuda::ptx::space_cluster_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_generic_sync_restrict_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_async_generic_sync_restrict(sem_acquire_t, space_cluster_t, scope_cluster_t) +{ +// __sem == sem_acquire (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_generic_sync_restrict_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// fence.proxy.async::generic.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 +// .sem = { .release } +// .space = { .shared::cta } +// .scope = { .cluster } +template +__device__ static inline void fence_proxy_async_generic_sync_restrict( + cuda::ptx::sem_release_t, + cuda::ptx::space_shared_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_generic_sync_restrict_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_async_generic_sync_restrict(sem_release_t, space_shared_t, scope_cluster_t) +{ +// __sem == sem_release (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_generic_sync_restrict_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_GENERIC_SYNC_RESTRICT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h index 1e6119ee032..8988292b6d3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h @@ -19,21 +19,27 @@ _CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, sco { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." 
: : : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 830 @@ -56,33 +62,39 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile("fence.proxy.tensormap::generic.acquire.cta [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.proxy.tensormap::generic.acquire.cta [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." 
+ : + : "l"(__addr), "n"(__size.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h new file mode 100644 index 00000000000..4930bec068b --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h @@ -0,0 +1,62 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_SYNC_RESTRICT_H_ +#define _CUDA_PTX_GENERATED_FENCE_SYNC_RESTRICT_H_ + +/* +// fence.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 +// .sem = { .acquire } +// .space = { .shared::cluster } +// .scope = { .cluster } +template +__device__ static inline void fence_sync_restrict( + cuda::ptx::sem_acquire_t, + cuda::ptx::space_cluster_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_sync_restrict_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_sync_restrict(sem_acquire_t, space_cluster_t, scope_cluster_t) +{ +// __sem == sem_acquire (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.acquire.sync_restrict::shared::cluster.cluster;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_sync_restrict_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// fence.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 +// .sem = { .release } +// .space = { .shared::cta } +// .scope = { .cluster } +template +__device__ static inline void fence_sync_restrict( + cuda::ptx::sem_release_t, + cuda::ptx::space_shared_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_sync_restrict_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_sync_restrict(sem_release_t, space_shared_t, scope_cluster_t) +{ +// __sem == sem_release (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.release.sync_restrict::shared::cta.cluster;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_sync_restrict_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_FENCE_SYNC_RESTRICT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h index da802adb9db..e5c8fa89225 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h @@ -133,13 +133,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nwarpid_is_not_supported_before template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - 
(_CUDA_VSTD::uint32_t __sreg_value; asm volatile("mov.u32 %0, %%nwarpid;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%nwarpid;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -258,13 +260,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nsmid_is_not_supported_before_S template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile("mov.u32 %0, %%nsmid;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%nsmid;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -293,17 +297,21 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supp template _CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mov.pred P_OUT, %%is_explicit_cluster;\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" : "=r"(__sreg_value) : :); - return static_cast(__sreg_value);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mov.pred P_OUT, %%is_explicit_cluster;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__sreg_value) + : + :); + return static_cast(__sreg_value); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -317,13 +325,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_x_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%clusterid.x;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error 
message + __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -337,13 +347,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_y_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%clusterid.y;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -357,13 +369,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_z_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%clusterid.z;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -377,13 +391,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_b template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%nclusterid.x;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -397,13 +413,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_b template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%nclusterid.y;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, 
%%nclusterid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -417,13 +435,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_b template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%nclusterid.z;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -437,13 +457,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_ctaid.x;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -457,13 +479,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_y() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_ctaid.y;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -477,13 +501,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_ctaid.z;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error 
message - __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -497,13 +523,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_support template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_nctaid.x;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -517,13 +545,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_support template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_nctaid.y;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -537,13 +567,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_support template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_nctaid.z;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -557,13 +589,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctarank_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() { - 
NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_ctarank;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctarank;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -577,13 +611,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctarank_is_not_support template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%cluster_nctarank;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctarank;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -597,13 +633,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%lanemask_eq;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_eq;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -617,13 +655,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_le_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%lanemask_le;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_le;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 
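For orientation: the hunks above and below only swap the NV_IF_ELSE_TARGET dispatch for plain preprocessor guards; the callable surface of the special-register getters is unchanged. A minimal usage sketch, assuming these wrappers stay exposed through <cuda/ptx> in namespace cuda::ptx, with kernel launch boilerplate omitted and the helper name purely illustrative (__ballot_sync and __popc are the standard CUDA warp intrinsics):

    #include <cuda/ptx>
    #include <cuda/std/cstdint>

    // Returns this lane's 0-based rank among the lanes of the warp whose
    // predicate is true. %lanemask_lt is a mask of all lanes with a lower ID
    // than the calling lane.
    __device__ cuda::std::uint32_t rank_among_voters(bool vote)
    {
      const cuda::std::uint32_t ballot = __ballot_sync(0xffffffffu, vote);      // lanes voting true
      const cuda::std::uint32_t below  = cuda::ptx::get_sreg_lanemask_lt();     // lanes below this one
      return __popc(ballot & below);                                            // lower-ID voters = rank
    }
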
@@ -637,13 +677,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_lt() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -657,13 +699,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%lanemask_ge;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_ge;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -677,13 +721,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%lanemask_gt;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -712,13 +758,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock_hi_is_not_supported_befor template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile("mov.u32 %0, %%clock_hi;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%clock_hi;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with 
a semi-decent error message + __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 500 @@ -732,13 +780,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock64_is_not_supported_before template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint64_t __sreg_value; asm volatile("mov.u64 %0, %%clock64;" : "=l"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint64_t __sreg_value; + asm volatile("mov.u64 %0, %%clock64;" : "=l"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -752,13 +802,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_globaltimer() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint64_t __sreg_value; asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint64_t __sreg_value; + asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 310 @@ -772,13 +824,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 310 @@ -792,13 +846,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || 
__CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 310 @@ -812,13 +868,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_total_smem_size_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%total_smem_size;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%total_smem_size;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 410 @@ -832,13 +890,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%aggr_smem_size;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%aggr_smem_size;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 810 @@ -852,13 +912,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_dynamic_smem_size_is_not_suppor template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm("mov.u32 %0, %%dynamic_smem_size;" : "=r"(__sreg_value) : :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%dynamic_smem_size;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 410 @@ -872,13 +934,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_current_graph_exec_is_not_suppo template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_50, - (_CUDA_VSTD::uint64_t __sreg_value; asm("mov.u64 %0, %%current_graph_exec;" : "=l"(__sreg_value) : 
:); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500 + _CUDA_VSTD::uint64_t __sreg_value; + asm("mov.u64 %0, %%current_graph_exec;" : "=l"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h index 22bb73180dc..c78637db3e9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h @@ -4,7 +4,7 @@ #define _CUDA_PTX_GENERATED_GETCTARANK_H_ /* -// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 +// getctarank.space.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } template __device__ static inline uint32_t getctarank( @@ -16,15 +16,16 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_getctarank_is_not_supported_before_SM_90 template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, const void* __addr) { - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __dest; - asm("getctarank.shared::cluster.u32 %0, %1;" : "=r"(__dest) : "r"(__as_ptr_smem(__addr)) :); - return __dest;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("getctarank.shared::cluster.u32 %0, %1;" : "=r"(__dest) : "r"(__as_ptr_smem(__addr)) :); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h new file mode 100644 index 00000000000..f93c8a62157 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h @@ -0,0 +1,33 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_MAPA_H_ +#define _CUDA_PTX_GENERATED_MAPA_H_ + +/* +// mapa.space.u32 dest, addr, target_cta; // PTX ISA 78, SM_90 +// .space = { .shared::cluster } +template +__device__ static inline Tp* mapa( + cuda::ptx::space_cluster_t, + const Tp* addr, + uint32_t target_cta); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mapa_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _Tp* mapa(space_cluster_t, const _Tp* __addr, _CUDA_VSTD::uint32_t __target_cta) +{ +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("mapa.shared::cluster.u32 %0, %1, %2;" : "=r"(__dest) : "r"(__as_ptr_smem(__addr)), "r"(__target_cta) :); + return __from_ptr_dsmem<_Tp>(__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mapa_is_not_supported_before_SM_90__(); + return __from_ptr_dsmem<_Tp>(0); +# endif +} +#endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_MAPA_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h index c7102ebfdb5..5f7b23dbb68 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h @@ -14,14 +14,18 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_ template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint64_t __state; asm("mbarrier.arrive.shared.b64 %0, [%1]; " - " // 1. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)) : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + _CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.shared.b64 %0, [%1]; // 1. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 700 @@ -38,21 +42,23 @@ template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.shared::cta.b64 %0, [%1], %2; " - "// 2. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), - "r"(__count) : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. 
" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 /* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 +// mbarrier.arrive.sem.scope.space.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -71,29 +77,34 @@ mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VS { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1]; // 3a. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 +// mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -117,29 +128,34 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2; // 3b. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. 
" - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 +// mbarrier.arrive.sem.scope.space.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -156,21 +172,23 @@ template _CCCL_DEVICE static inline void mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; " - " // 4a. " : : "r"(__as_ptr_remote_dsmem(__addr)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " + : + : "r"(__as_ptr_remote_dsmem(__addr)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 +// mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -188,18 +206,180 @@ template _CCCL_DEVICE static inline void mbarrier_arrive( sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; " - "// 4b. 
" : : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__count) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__count) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90 +// .space = { .shared::cta } +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( + space_shared_t, + sem_relaxed_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __count) +{ + // __space == space_shared (due to parameter type constraint) + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.relaxed.cta.shared::cta.b64 %0, [%1], %2;" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.relaxed.cluster.shared::cta.b64 %0, [%1], %2;" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90 +// .space = { .shared::cta } +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive(space_shared_t, sem_relaxed_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr) +{ + // __space == space_shared (due to parameter type constraint) + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.relaxed.cta.shared::cta.b64 %0, [%1];" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.relaxed.cluster.shared::cta.b64 %0, [%1];" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // PTX ISA 86, SM_90 +// .space = { .shared::cluster } +// .sem = { .relaxed } +// .scope = { .cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_arrive( + space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ +// __space == space_cluster (due to parameter type constraint) +// __sem == sem_relaxed (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [%0], %1;" + : + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.arrive.sem.scope.space.b64 _, [addr]; // PTX ISA 86, SM_90 +// .space = { .shared::cluster } +// .sem = { .relaxed } +// .scope = { .cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +mbarrier_arrive(space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr) +{ +// __space == space_cluster (due to parameter type constraint) +// __sem == sem_relaxed (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [%0];" : : "r"(__as_ptr_smem(__addr)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h index dc33b212e21..5cbcd4cb3aa 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h @@ -4,7 +4,7 @@ #define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ /* -// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -28,29 +28,34 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 8. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return 0;)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -68,18 +73,104 @@ template _CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " : : "r"( - __as_ptr_remote_dsmem(__addr)), - "r"(__tx_count) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. 
" + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90 +// .space = { .shared::cta } +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& txCount); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( + space_shared_t, + sem_relaxed_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __txCount) +{ + // __space == space_shared (due to parameter type constraint) + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 %0, [%1], %2;" + : "=l"(__state) + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.expect_tx.relaxed.cluster.shared::cta.b64 %0, [%1], %2;" + : "=l"(__state) + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], txCount; // PTX ISA 86, SM_90 +// .space = { .shared::cluster } +// .sem = { .relaxed } +// .scope = { .cluster } +template +__device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr, + const uint32_t& txCount); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( + space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __txCount) +{ +// __space == space_cluster (due to parameter type constraint) +// __sem == sem_relaxed (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64 _, [%0], %1;" + : + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h 
b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h index 45c444c5364..2a9ebacf295 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h @@ -16,16 +16,18 @@ template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; " - "// 5. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), - "r"(__count) : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + _CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h new file mode 100644 index 00000000000..94d66b79a35 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h @@ -0,0 +1,94 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_EXPECT_TX_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_EXPECT_TX_H_ + +/* +// mbarrier.expect_tx.sem.scope.space.b64 [addr], txCount; // 1. PTX ISA 80, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline void mbarrier_expect_tx( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + uint32_t txCount); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_expect_tx( + sem_relaxed_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __txCount) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1; // 1." + : + : "r"(__as_ptr_smem(__addr)), "r"(__txCount) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.expect_tx.relaxed.cluster.shared::cta.b64 [%0], %1; // 1." + : + : "r"(__as_ptr_smem(__addr)), "r"(__txCount) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_expect_tx_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.expect_tx.sem.scope.space.b64 [addr], txCount; // 2. 
PTX ISA 80, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_expect_tx( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_cluster_t, + uint64_t* addr, + uint32_t txCount); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_expect_tx( + sem_relaxed_t, scope_t<_Scope> __scope, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __txCount) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.expect_tx.relaxed.cta.shared::cluster.b64 [%0], %1; // 2." + : + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.expect_tx.relaxed.cluster.shared::cluster.b64 [%0], %1; // 2." + : + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_expect_tx_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h index 6b3041de0d2..9ba345f8ff2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h @@ -15,12 +15,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_init_is_not_supported_before_SM template _CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (asm("mbarrier.init.shared.b64 [%0], %1;" : : "r"(__as_ptr_smem(__addr)), "r"(__count) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + asm("mbarrier.init.shared.b64 [%0], %1;" : : "r"(__as_ptr_smem(__addr)), "r"(__count) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__(); +# endif } #endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h index 9adc677c76d..53263270f0d 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h @@ -15,23 +15,26 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_befo template _CCCL_DEVICE static inline bool mbarrier_test_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), - "l"(__state) : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 700 /* -// mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX +// mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } @@ -50,31 +53,87 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 2. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. " - "\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 2. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_test_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_test_wait( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.test_wait.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.test_wait.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h index 1166b336d2d..3a281e22087 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h @@ -16,23 +16,26 @@ template _CCCL_DEVICE static inline bool mbarrier_test_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), - "r"(__phaseParity) : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 710 /* -// mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX +// mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } @@ -51,30 +54,87 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_test_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_test_wait_parity( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h index 52fa5a4928a..c048136b87a 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h @@ -15,18 +15,21 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_befor template _CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), - "l"(__state) : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -44,25 +47,27 @@ template _CCCL_DEVICE static inline bool mbarrier_try_wait( _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state, const _CUDA_VSTD::uint32_t& __suspendTimeHint) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), - "l"(__state), - "r"(__suspendTimeHint) : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 /* -// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. -PTX ISA 80, SM_90 +// mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX +ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -80,36 +85,40 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. 
" - "\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. -PTX ISA 80, SM_90 +// mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX +ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -132,30 +141,147 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_try_wait( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2, %3;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), + "r"(__as_ptr_smem(__addr)), + "l"(__state), + "r"(__suspendTimeHint) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2, %3;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), + "r"(__as_ptr_smem(__addr)), + "l"(__state), + "r"(__suspendTimeHint) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_try_wait( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), 
"r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h index aa15e255352..0d6f7d3a9df 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h @@ -16,18 +16,21 @@ template _CCCL_DEVICE static inline bool mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), - "r"(__phaseParity) : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -45,25 +48,27 @@ template _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity, const _CUDA_VSTD::uint32_t& __suspendTimeHint) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), - "r"(__phaseParity), - "r"(__suspendTimeHint) : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 /* -// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. -PTX ISA 80, SM_90 +// mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX +ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -81,35 +86,40 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. -PTX ISA 80, SM_90 +// mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
PTX +ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -132,30 +142,148 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // PTX ISA 86, +SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_try_wait_parity( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2, %3;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), + "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity), + "r"(__suspendTimeHint) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2, %3;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), + "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity), + "r"(__suspendTimeHint) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_try_wait_parity( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + 
"mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h new file mode 100644 index 00000000000..51de5257bba --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h @@ -0,0 +1,2148 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MULTIMEM_LD_REDUCE_H_ +#define _CUDA_PTX_GENERATED_MULTIMEM_LD_REDUCE_H_ + +/* +// multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .min } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_weak_t, op_min_t, const _CUDA_VSTD::uint32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.min.u32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, const _CUDA_VSTD::uint32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.min.u32 %0, 
[%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .min } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_weak_t, op_min_t, const _CUDA_VSTD::uint64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.min.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, const _CUDA_VSTD::uint64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem 
== sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .min } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_weak_t, op_min_t, const _CUDA_VSTD::int32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + asm("multimem.ld_reduce.weak.global.min.s32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + 
cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, const _CUDA_VSTD::int32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .min } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_weak_t, op_min_t, const _CUDA_VSTD::int64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + asm("multimem.ld_reduce.weak.global.min.s64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error 
message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, const _CUDA_VSTD::int64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .max } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_weak_t, op_max_t, const _CUDA_VSTD::uint32_t* __addr) +{ +// __sem == 
sem_weak (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.max.u32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, const _CUDA_VSTD::uint32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .max } 
+template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_weak_t, op_max_t, const _CUDA_VSTD::uint64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.max.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, const _CUDA_VSTD::uint64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.max.u64 %0, [%1];" + : 
"=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .max } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_weak_t, op_max_t, const _CUDA_VSTD::int32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + asm("multimem.ld_reduce.weak.global.max.s32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, const _CUDA_VSTD::int32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.max.s32 %0, [%1];" + : "=r"(__dest) + 
: "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .max } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_weak_t, op_max_t, const _CUDA_VSTD::int64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + asm("multimem.ld_reduce.weak.global.max.s64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, const _CUDA_VSTD::int64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.max.s64 %0, [%1];" + : "=l"(__dest) + : 
"l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .add } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_weak_t, op_add_t, const _CUDA_VSTD::uint32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.add.u32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, const _CUDA_VSTD::uint32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.add.u32 %0, [%1];" + : "=r"(__dest) + : 
"l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .add } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_weak_t, op_add_t, const _CUDA_VSTD::uint64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.add.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, const _CUDA_VSTD::uint64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ 
>= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .add } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_weak_t, op_add_t, const _CUDA_VSTD::int32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + asm("multimem.ld_reduce.weak.global.add.s32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t 
+multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, const _CUDA_VSTD::int32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .add } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_weak_t, op_add_t, const _CUDA_VSTD::int64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + asm("multimem.ld_reduce.weak.global.add.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// 
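+//
+// Illustrative usage (a minimal sketch; `mc_counter` and `accumulate_sum` are assumed
+// names, and the pointer must refer to a multimem/multicast mapping):
+//
+//   #include <cuda/ptx>
+//   #include <cuda/std/cstdint>
+//
+//   __device__ cuda::std::int64_t accumulate_sum(const cuda::std::int64_t* mc_counter)
+//   {
+//     // Relaxed, gpu-scope load that returns the sum over all replicas of *mc_counter.
+//     return cuda::ptx::multimem_ld_reduce(cuda::ptx::sem_relaxed, cuda::ptx::scope_gpu, cuda::ptx::op_add, mc_counter);
+//   }
+//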
.scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, const _CUDA_VSTD::int64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .and } +template = true> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_and_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B32 multimem_ld_reduce(sem_weak_t, op_and_op_t, const _B32* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.and.b32 
%0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .and } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B32 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_and_op_t, const _B32* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// 
multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .or } +template = true> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_or_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B32 multimem_ld_reduce(sem_weak_t, op_or_op_t, const _B32* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.or.b32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .or } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B32 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_or_op_t, const _B32* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR 
(__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .xor } +template = true> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_xor_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B32 multimem_ld_reduce(sem_weak_t, op_xor_op_t, const _B32* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.xor.b32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .xor } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B32 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_xor_op_t, const _B32* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + 
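+    // (sem, scope) are compile-time constants, so this if/else chain selects exactly one
+    // qualifier combination; for (relaxed, gpu) the instruction below is the one emitted.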
asm("multimem.ld_reduce.relaxed.gpu.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .and } +template = true> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_and_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B64 multimem_ld_reduce(sem_weak_t, op_and_op_t, const _B64* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.and.b64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .and } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B64 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_and_op_t, const _B64* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || 
__scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .or } +template = true> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_or_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B64 multimem_ld_reduce(sem_weak_t, op_or_op_t, const _B64* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.or.b64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b64 dest, 
[addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .or } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B64 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_or_op_t, const _B64* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .xor } +template = true> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_xor_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B64 multimem_ld_reduce(sem_weak_t, op_xor_op_t, const _B64* __addr) +{ + // __sem == 
sem_weak (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.xor.b64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .xor } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B64 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_xor_op_t, const _B64* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker 
error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_MULTIMEM_LD_REDUCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_red.h b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_red.h new file mode 100644 index 00000000000..1ef97121d31 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_red.h @@ -0,0 +1,1272 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MULTIMEM_RED_H_ +#define _CUDA_PTX_GENERATED_MULTIMEM_RED_H_ + +/* +// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, _CUDA_VSTD::uint32_t* __addr, _CUDA_VSTD::uint32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 +// .sem 
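+//
+// Illustrative usage (a minimal sketch; `mc_slot` and `publish_min` are assumed
+// names, and the pointer must refer to a multimem/multicast mapping):
+//
+//   #include <cuda/ptx>
+//   #include <cuda/std/cstdint>
+//
+//   __device__ void publish_min(cuda::std::uint32_t* mc_slot, cuda::std::uint32_t candidate)
+//   {
+//     // Applies min(candidate, replica value) to every replica behind mc_slot
+//     // with release semantics at gpu scope; no value is returned.
+//     cuda::ptx::multimem_red(cuda::ptx::sem_release, cuda::ptx::scope_gpu, cuda::ptx::op_min, mc_slot, candidate);
+//   }
+//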
= { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, _CUDA_VSTD::int32_t* __addr, _CUDA_VSTD::int32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + 
_CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, _CUDA_VSTD::int64_t* __addr, _CUDA_VSTD::int64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == 
scope_cta) + { + asm("multimem.red.release.cta.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, _CUDA_VSTD::uint32_t* __addr, _CUDA_VSTD::uint32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, _CUDA_VSTD::int32_t* __addr, _CUDA_VSTD::int32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || 
__scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, _CUDA_VSTD::int64_t* __addr, _CUDA_VSTD::int64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) 
+ { + asm("multimem.red.relaxed.sys.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, _CUDA_VSTD::uint32_t* __addr, _CUDA_VSTD::uint32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.add.u32 [%0], 
%1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, 
_CUDA_VSTD::int32_t* __addr, _CUDA_VSTD::int32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, _CUDA_VSTD::int64_t* __addr, _CUDA_VSTD::int64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + 
asm("multimem.red.relaxed.gpu.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .and } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_and_op_t, _B32* __addr, _B32 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.and.b32 [%0], %1;" + : + : 
"l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .or } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_or_op_t, _B32* __addr, _B32 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .xor } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_xor_op_t, _B32* __addr, _B32 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .and } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_and_op_t, _B64* __addr, _B64 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .or } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_or_op_t, _B64* __addr, _B64 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == 
sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .xor } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_xor_op_t, _B64* __addr, _B64 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_MULTIMEM_RED_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_st.h b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_st.h new file mode 100644 index 00000000000..91319874243 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_st.h @@ -0,0 +1,186 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MULTIMEM_ST_H_ +#define _CUDA_PTX_GENERATED_MULTIMEM_ST_H_ + +/* +// multimem.st.sem.global.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .weak } +template = true> +__device__ static inline void multimem_st( + cuda::ptx::sem_weak_t, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline void multimem_st(sem_weak_t, _B32* __addr, _B32 __val) +{ + // __sem == sem_weak (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("multimem.st.weak.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void multimem_st(sem_t<_Sem> __sem, scope_t<_Scope> __scope, _B32* __addr, _B32 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || 
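For reference, a minimal usage sketch of the cuda::ptx::multimem_red wrappers added in multimem_red.h above. This is illustrative only and not part of the generated header; it assumes an SM_90+ target, PTX ISA 8.1, and that mc_addr is a multicast (multimem) address obtained through the CUDA driver's multicast APIs (that setup is hypothetical and not shown here).

#include <cuda/ptx>
#include <cstdint>

__global__ void multimem_red_sketch(std::uint64_t* mc_addr, std::uint64_t v)
{
  namespace ptx = cuda::ptx;
  // Emits: multimem.red.relaxed.gpu.global.add.u64 [mc_addr], v;
  ptx::multimem_red(ptx::sem_relaxed, ptx::scope_gpu, ptx::op_add, mc_addr, v);
  // Emits: multimem.red.release.sys.global.min.u64 [mc_addr], v;
  ptx::multimem_red(ptx::sem_release, ptx::scope_sys, ptx::op_min, mc_addr, v);
}

The semantics, scope, and operation are selected purely through the tag arguments; the overload set generated above resolves the element type from the pointer argument.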
__CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.st.relaxed.cta.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.st.relaxed.cluster.global.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.st.relaxed.gpu.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.st.relaxed.sys.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.st.release.cta.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.st.release.cluster.global.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.st.release.gpu.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.st.release.sys.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.st.sem.global.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .weak } +template = true> +__device__ static inline void multimem_st( + cuda::ptx::sem_weak_t, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline void multimem_st(sem_weak_t, _B64* __addr, _B64 __val) +{ + // __sem == sem_weak (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("multimem.st.weak.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void multimem_st(sem_t<_Sem> __sem, scope_t<_Scope> __scope, _B64* __addr, _B64 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == 
scope_sys, ""); + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.st.relaxed.cta.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.st.relaxed.cluster.global.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.st.relaxed.gpu.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.st.relaxed.sys.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.st.release.cta.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.st.release.cluster.global.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.st.release.gpu.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.st.release.sys.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_MULTIMEM_ST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h index 74110933270..767411d4719 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h @@ -4,8 +4,8 @@ #define _CUDA_PTX_GENERATED_RED_ASYNC_H_ /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } template @@ -21,23 +21,23 @@ template _CCCL_DEVICE static inline void red_async( op_inc_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_inc (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) 
|| __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } template @@ -53,23 +53,23 @@ template _CCCL_DEVICE static inline void red_async( op_dec_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_dec (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .min } template @@ -85,23 +85,23 @@ template _CCCL_DEVICE static inline void red_async( op_min_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker 
error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .max } template @@ -117,23 +117,23 @@ template _CCCL_DEVICE static inline void red_async( op_max_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .add } template @@ -149,23 +149,23 @@ template _CCCL_DEVICE static inline void red_async( op_add_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// 
red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .s32 } // .op = { .min } template @@ -181,23 +181,23 @@ template _CCCL_DEVICE static inline void red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_s32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .s32 } // .op = { .max } template @@ -213,23 +213,23 @@ template _CCCL_DEVICE static inline void red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_s32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .s32 } // .op = { .add } template @@ -245,26 +245,26 @@ template _CCCL_DEVICE static inline void red_async(op_add_t, _CUDA_VSTD::int32_t* __dest, const 
_CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_s32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .b32 } // .op = { .and } -template +template = true> __device__ static inline void red_async( cuda::ptx::op_and_op_t, B32* dest, @@ -273,31 +273,31 @@ __device__ static inline void red_async( */ #if __cccl_ptx_isa >= 810 extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void red_async(op_and_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) { // __type == type_b32 (due to parameter type constraint) // __op == op_and_op (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__as_b32(__value)), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .b32 } // .op = { .or } -template +template = true> __device__ static inline void red_async( cuda::ptx::op_or_op_t, B32* dest, @@ -306,31 +306,31 @@ __device__ static inline void red_async( */ #if __cccl_ptx_isa >= 810 extern "C" _CCCL_DEVICE void 
__cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void red_async(op_or_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) { // __type == type_b32 (due to parameter type constraint) // __op == op_or_op (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__as_b32(__value)), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .b32 } // .op = { .xor } -template +template = true> __device__ static inline void red_async( cuda::ptx::op_xor_op_t, B32* dest, @@ -339,28 +339,28 @@ __device__ static inline void red_async( */ #if __cccl_ptx_isa >= 810 extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) { // __type == type_b32 (due to parameter type constraint) // __op == op_xor_op (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "r"(__as_b32(__value)), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u64 } // .op = { .add } template @@ -376,22 +376,22 @@ template _CCCL_DEVICE static inline void red_async( op_add_t, _CUDA_VSTD::uint64_t* __dest, const _CUDA_VSTD::uint64_t& __value, 
_CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " : : "r"( - __as_ptr_remote_dsmem(__dest)), - "l"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } template @@ -407,16 +407,17 @@ template _CCCL_DEVICE static inline void red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __value, _CUDA_VSTD::int64_t* __remote_bar) { - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 " - "intentional" : : "r"(__as_ptr_remote_dsmem(__dest)), - "l"(__value), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 " + "intentional" + : + : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h index e6c3fcf1737..e59208e59ba 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h @@ -4,7 +4,7 @@ #define _CUDA_PTX_GENERATED_ST_ASYNC_H_ /* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.type [addr], value, [remote_bar]; // 1. 
PTX ISA 81, SM_90 // .type = { .b32, .b64 } template @@ -19,28 +19,30 @@ template _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _CUDA_VSTD::uint64_t* __remote_bar) { static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.type [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 // .type = { .b32, .b64 } template @@ -55,35 +57,37 @@ template _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2], _CUDA_VSTD::uint64_t* __remote_bar) { static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__as_b32(__value[0])), - "r"(__as_b32(__value[1])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "l"(__as_b64(__value[0])), - "l"(__as_b64(__value[1])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. 
" + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "l"(__as_b64(__value[0])), + "l"(__as_b64(__value[1])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, -SM_90 template +SM_90 template = true> __device__ static inline void st_async( B32* addr, const B32 (&value)[4], @@ -91,22 +95,24 @@ __device__ static inline void st_async( */ #if __cccl_ptx_isa >= 810 extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], _CUDA_VSTD::uint64_t* __remote_bar) { static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // " - "3. " : : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__as_b32(__value[0])), - "r"(__as_b32(__value[1])), - "r"(__as_b32(__value[2])), - "r"(__as_b32(__value[3])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_b32(__value[2])), + "r"(__as_b32(__value[3])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/generated/st_bulk.h new file mode 100644 index 00000000000..bc02c785f86 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_bulk.h @@ -0,0 +1,31 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_ST_BULK_H_ +#define _CUDA_PTX_GENERATED_ST_BULK_H_ + +/* +// st.bulk.weak.shared::cta [addr], size, initval; // PTX ISA 86, SM_100 +template +__device__ static inline void st_bulk( + void* addr, + uint64_t size, + cuda::ptx::n32_t initval); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_st_bulk_is_not_supported_before_SM_100__(); +template +_CCCL_DEVICE static inline void st_bulk(void* __addr, _CUDA_VSTD::uint64_t __size, n32_t<_N32> __initval) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + asm("st.bulk.weak.shared::cta [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__addr)), "l"(__size), "n"(__initval.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_bulk_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_ST_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h new file mode 100644 index 00000000000..27ca2f86080 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h @@ -0,0 +1,105 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TCGEN05_ALLOC_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_ALLOC_H_ + +/* +// tcgen05.alloc.cta_group.sync.aligned.shared::cta.b32 [dst], nCols; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_alloc( + cuda::ptx::cta_group_t cta_group, + uint32_t* dst, + const uint32_t& nCols); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_alloc_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_alloc(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t* __dst, const _CUDA_VSTD::uint32_t& __nCols) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__dst)), "r"(__nCols) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__dst)), "r"(__nCols) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_alloc_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.dealloc.cta_group.sync.aligned.b32 taddr, nCols; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_dealloc( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + const uint32_t& nCols); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_dealloc_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_dealloc(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, const _CUDA_VSTD::uint32_t& __nCols) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + 
_CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.dealloc.cta_group::1.sync.aligned.b32 %0, %1;" : : "r"(__taddr), "r"(__nCols) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.dealloc.cta_group::2.sync.aligned.b32 %0, %1;" : : "r"(__taddr), "r"(__nCols) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_dealloc_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.relinquish_alloc_permit.cta_group.sync.aligned; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_relinquish_alloc_permit( + cuda::ptx::cta_group_t cta_group); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_relinquish_alloc_permit_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_relinquish_alloc_permit(cta_group_t<_Cta_Group> __cta_group) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned;" : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_relinquish_alloc_permit_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_ALLOC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_commit.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_commit.h new file mode 100644 index 00000000000..30865d000df --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_commit.h @@ -0,0 +1,81 @@ +// This file was automatically generated. Do not edit. 
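The tcgen05_alloc / tcgen05_dealloc / tcgen05_relinquish_alloc_permit wrappers above manage tensor-memory columns on SM_100a / SM_101a. A minimal allocation round trip sketched from those signatures; the single-warp usage, the column count, and the synchronization are illustrative assumptions rather than a verified protocol:

#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void tcgen05_alloc_demo()
{
  // tcgen05.alloc writes the allocated tensor-memory address into shared memory.
  __shared__ cuda::std::uint32_t taddr_slot;
  constexpr cuda::std::uint32_t ncols = 32; // illustrative column count

  if (threadIdx.x < 32) // one warp issues the aligned alloc/dealloc sequence
  {
    cuda::ptx::tcgen05_alloc(cuda::ptx::cta_group_1, &taddr_slot, ncols);
    cuda::ptx::tcgen05_relinquish_alloc_permit(cuda::ptx::cta_group_1);
    __syncwarp();
    // ... use the tensor memory at taddr_slot, e.g. via the tcgen05.cp / tcgen05.ld
    //     wrappers added later in this patch ...
    cuda::ptx::tcgen05_dealloc(cuda::ptx::cta_group_1, taddr_slot, ncols);
  }
}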
+ +#ifndef _CUDA_PTX_GENERATED_TCGEN05_COMMIT_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_COMMIT_H_ + +/* +// tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_commit( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_commit_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_commit(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint64_t* __smem_bar) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%0];" + : + : "r"(__as_ptr_dsmem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%0];" + : + : "r"(__as_ptr_dsmem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_commit_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_commit_multicast( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar, + uint16_t ctaMask); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_commit_multicast_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_commit_multicast( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint64_t* __smem_bar, _CUDA_VSTD::uint16_t __ctaMask) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%0], %1;" + : + : "r"(__as_ptr_dsmem(__smem_bar)), "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%0], %1;" + : + : "r"(__as_ptr_dsmem(__smem_bar)), "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_commit_multicast_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_COMMIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_cp.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_cp.h new file mode 100644 index 00000000000..e213f9ba745 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_cp.h @@ -0,0 +1,612 @@ +// This file was automatically generated. Do not edit. 
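tcgen05_commit and tcgen05_commit_multicast above make previously issued asynchronous tcgen05 operations observable through an mbarrier. A hypothetical sketch based on the signatures shown; the arrival count, warp choice, and cta_mask value are assumptions, and the wait on the barrier is omitted:

#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void tcgen05_commit_demo(cuda::std::uint16_t cta_mask)
{
  __shared__ cuda::std::uint64_t bar;
  if (threadIdx.x == 0)
  {
    cuda::ptx::mbarrier_init(&bar, 1); // illustrative arrival count
  }
  __syncthreads();

  if (threadIdx.x < 32)
  {
    // ... asynchronous tcgen05 operations issued here (omitted) ...
    cuda::ptx::tcgen05_commit(cuda::ptx::cta_group_1, &bar);                     // arrive on one barrier
    cuda::ptx::tcgen05_commit_multicast(cuda::ptx::cta_group_1, &bar, cta_mask); // arrive on the CTAs selected by cta_mask
  }
}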
+ +#ifndef _CUDA_PTX_GENERATED_TCGEN05_CP_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_CP_H_ + +/* +// tcgen05.cp.cta_group.128x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x256b_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_cp_128x256b(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x256b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x256b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.4x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_4x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_4x256b_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_cp_4x256b(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.4x256b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.4x256b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_4x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x128b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x128b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x128b_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_cp_128x128b(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x128b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x128b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::02_13 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_02_13( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_02_13( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::01_23 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_01_23( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_01_23( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.32x128b.warpx4 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_32x128b_warpx4( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_32x128b_warpx4_is_not_supported_before_SM_100a_SM_101a__(); +template 
+_CCCL_DEVICE static inline void tcgen05_cp_32x128b_warpx4( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.32x128b.warpx4 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.32x128b.warpx4 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_32x128b_warpx4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x256b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_128x256b_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x256b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.4x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_4x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_4x256b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_4x256b_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_cp_4x256b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x128b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x128b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x128b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_128x128b_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x128b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE 
static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x256b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_128x256b_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), 
"l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x256b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.4x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_4x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_4x256b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_4x256b_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_4x256b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x128b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x128b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x128b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_128x128b_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x128b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template 
+_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) 
+ { + asm("tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_CP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_fence.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_fence.h new file mode 100644 index 00000000000..efedcf86a57 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_fence.h @@ -0,0 +1,44 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TCGEN05_FENCE_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_FENCE_H_ + +/* +// tcgen05.fence::before_thread_sync; // PTX ISA 86, SM_100a, SM_101a +template +__device__ static inline void tcgen05_fence_before_thread_sync(); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_fence_before_thread_sync_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_fence_before_thread_sync() +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm volatile("tcgen05.fence::before_thread_sync;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_fence_before_thread_sync_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.fence::after_thread_sync; // PTX ISA 86, SM_100a, SM_101a +template +__device__ static inline void tcgen05_fence_after_thread_sync(); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_fence_after_thread_sync_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_fence_after_thread_sync() +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm volatile("tcgen05.fence::after_thread_sync;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_fence_after_thread_sync_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_FENCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_ld.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_ld.h new file mode 100644 index 00000000000..e5ec1b686c2 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_ld.h @@ -0,0 +1,4446 @@ +// This file was automatically generated. Do not edit. 
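The tcgen05_cp variants above copy tiles described by a shared-memory descriptor into tensor memory, and the tcgen05_fence helpers order tcgen05 traffic around ordinary thread synchronization. A sketch under the assumption that taddr and smem_desc were produced elsewhere (for example by tcgen05_alloc and an encoded shared-memory descriptor); completion would normally be signalled with tcgen05_commit as in the earlier sketch:

#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void tcgen05_cp_demo(cuda::std::uint32_t taddr, cuda::std::uint64_t smem_desc)
{
  // Copy one 128x128b tile from the descriptor-described shared memory into tensor memory.
  cuda::ptx::tcgen05_cp_128x128b(cuda::ptx::cta_group_1, taddr, smem_desc);

  // When tcgen05 work must be ordered across a thread sync, bracket the sync
  // with the fence helpers added above.
  cuda::ptx::tcgen05_fence_before_thread_sync();
  __syncwarp();
  cuda::ptx::tcgen05_fence_after_thread_sync();
}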
+ +#ifndef _CUDA_PTX_GENERATED_TCGEN05_LD_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_LD_H_ + +/* +// tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[1], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x1.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[1], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x2.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + 
asm("tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x4.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> 
+__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, " + "[%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} 
+#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void 
tcgen05_ld_16x64b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + 
"=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x128.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + 
"=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), 
+ "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x1.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x2.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x4.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : 
"r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, " + "[%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + 
"=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) 
+ : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, 
%37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + 
"=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + 
"=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x2.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[16], + uint32_t 
taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x4.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, " + "[%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + 
"=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + 
"=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + 
"=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + 
"=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[1], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x1.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[1], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if 
_CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x2.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x4.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ 
static inline void tcgen05_ld_32x32b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, " + "[%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[16], + uint32_t taddr); +*/ 
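+// Illustrative usage sketch (not part of the generated header): `read_tmem`, `taddr`, and `out`
+// are hypothetical names. It assumes tensor memory was already allocated (tcgen05.alloc) and that
+// all 32 threads of the warp execute the call; a tcgen05.wait::ld may be needed before the loaded
+// values are consumed, so consult the PTX ISA for the exact completion rules.
+//
+//   __global__ void read_tmem(uint32_t taddr, uint32_t* out) {
+//     uint32_t regs[16];
+//     cuda::ptx::tcgen05_ld_32x32b_pack_16b(regs, taddr); // each thread receives 16 b32 values
+//     out[threadIdx.x] = regs[0];
+//   }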
+#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, 
%24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[64], + uint32_t 
taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x128.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, 
%120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, 
%21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[1], + uint32_t taddr, + cuda::ptx::n32_t 
immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x1.b32 {%0}, [%1], %2;" + : "=r"(__out[0]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[1], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 {%0}, [%1], %2;" + : "=r"(__out[0]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[2], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x2.b32 {%0, %1}, [%2], %3;" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[2], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 
(&__out)[2], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 {%0, %1}, [%2], %3;" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[4], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x4.b32 {%0, %1, %2, %3}, [%4], %5;" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[4], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 {%0, %1, %2, %3}, [%4], %5;" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[8], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) 
|| __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8], %9;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[8], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8], %9;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[16], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15}, [%16], %17;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[16], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); 
+*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16], %17;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[32], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32], %33;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[32], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t 
__taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32], %33;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[64], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64], %65;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + 
"=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[64], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64], %65;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[128], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + 
static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x128.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128], %129;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[128], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128], %129;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + 
"=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_LD_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma.h new file mode 100644 index 00000000000..58e3f1e8363 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma.h @@ -0,0 +1,3842 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TCGEN05_MMA_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_MMA_H_ + +/* +// tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // +PTX ISA 86, SM_100a +// .kind = { .kind::f16, .kind::tf32 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4], + bool __enable_input_d, + n32_t<_N32> __scale_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + 
"r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4], + bool __enable_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == 
kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::i8 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_2) + { + asm 
volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::i8 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a +// .kind = { .kind::f16, .kind::tf32 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + n32_t<_N32> __scale_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), 
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::i8 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::i8 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // +PTX ISA 86, SM_100a +// .kind = { .kind::f16, .kind::tf32 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_tmem_a( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4], + bool __enable_input_d, + n32_t<_N32> __scale_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg 
.pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_tmem_a( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4], + bool __enable_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : 
"r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::i8 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + 
"l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::i8 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a +// .kind = { .kind::f16, .kind::tf32 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_tmem_a( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + n32_t<_N32> __scale_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : 
"memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_tmem_a( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::i8 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::i8 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_1x_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + 
"tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_2x_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred 
PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_4x_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent 
error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + kind_t<_Kind> __kind, + 
cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
+__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + 
"tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + 
"tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { 
.cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif 
// __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use( + 
kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : 
"r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; 
\n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" 
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, 
.cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + 
bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void 
tcgen05_mma_block_scale_vec_4x_collector_a_lastuse( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, 
%3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, 
%3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void 
tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// 
tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard( + 
kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " 
+ "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + 
_CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_MMA_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h new file mode 100644 index 00000000000..8d09698052d --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h @@ -0,0 +1,6438 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TCGEN05_MMA_WS_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_MMA_WS_H_ + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + 
"tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg 
.pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + 
"l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, 
idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template 
+__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_mma_ws_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t 
__d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == 
kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 
PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + 
"l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + 
"}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( 
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 
0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_mma_ws_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// 
tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t 
a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +template 
+_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + 
static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred 
PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + 
"l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + 
"}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( 
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 
PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message 
+ __cuda_ptx_tcgen05_mma_ws_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + 
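
[Editorial note, not part of this patch.] For orientation while reviewing these hunks, below is a minimal, hedged usage sketch of one of the wrappers added above, cuda::ptx::tcgen05_mma_ws_collector_b1_fill (the overload without a zero-column-mask descriptor). It assumes the <cuda/ptx> umbrella header, the cuda::ptx::kind_f16 tag constant referenced by the static_asserts above, and a default-constructible cuda::ptx::cta_group_1_t tag type as named in the doc comments; the d_tmem, a_desc, b_desc, and idesc arguments are treated as opaque, pre-built values supplied by the caller, and issue_ws_mma_f16 is a hypothetical helper name introduced only for illustration.

// Editorial sketch (not part of the generated header): issue one weight-stationary
// tcgen05 MMA with the collector-buffer B1 "fill" policy, D = A*B (+ D if requested).
// The guard mirrors the architecture feature macros the wrappers above key on; the
// exact user-side guard is an assumption of this sketch.
#include <cuda/ptx>
#include <cstdint>

__device__ void issue_ws_mma_f16(
  std::uint32_t d_tmem, // tensor-memory address of the accumulator D (pre-allocated by the caller)
  std::uint64_t a_desc, // shared-memory matrix descriptor for A (pre-built by the caller)
  std::uint64_t b_desc, // shared-memory matrix descriptor for B (pre-built by the caller)
  std::uint32_t idesc,  // instruction descriptor (pre-built by the caller)
  bool accumulate)      // maps to enable_input_d: true keeps the existing D contents
{
#if defined(__CUDA_ARCH_FEAT_SM100_ALL) || defined(__CUDA_ARCH_FEAT_SM101_ALL)
  // Single-CTA group, f16 kind; fills collector buffer B1 with the B operand.
  cuda::ptx::tcgen05_mma_ws_collector_b1_fill(
    cuda::ptx::cta_group_1_t{}, cuda::ptx::kind_f16,
    d_tmem, a_desc, b_desc, idesc, accumulate);
#endif
}

On architectures other than SM_100a/SM_101a the call compiles to nothing here; calling the wrapper unconditionally would instead trigger the deliberate linker error documented in the hunks above.
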
+/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t 
kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_mma_ws_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + 
_CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || 
__CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : 
"r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + 
"tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } 
+ else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 
PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message 
+ __cuda_ptx_tcgen05_mma_ws_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + 
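Editor's note (not part of the patch): every wrapper in this hunk follows the same shape — a documentation comment carrying the PTX signature, an extern "C" declaration whose mangled name doubles as the "unsupported architecture" link-time diagnostic, and an if-constexpr dispatch over the .kind modifier that emits the matching inline asm. The sketch below shows how a caller might invoke one of these overloads from device code. It is an illustrative assumption, not the library's documented usage: the helper name issue_ws_mma_f16 and all argument values are hypothetical, the tag constants cuda::ptx::cta_group_1 and cuda::ptx::kind_f16 are assumed to be spelled as elsewhere in the cuda::ptx dot-variant headers, and real code must first allocate tensor memory, build valid shared-memory and instruction descriptors, and synchronize with the appropriate tcgen05 fences/mbarriers, none of which is shown here.

// Illustrative usage sketch only; assumes a TU compiled for sm_100a/sm_101a
// with PTX ISA >= 8.6 and operands prepared elsewhere (tmem allocation,
// descriptor construction, barriers).
#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void issue_ws_mma_f16(cuda::std::uint32_t d_tmem, // hypothetical helper
                                 cuda::std::uint32_t a_tmem,
                                 cuda::std::uint64_t b_desc,
                                 cuda::std::uint32_t idesc)
{
#if __cccl_ptx_isa >= 860 && (__CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL)
  // Weight-stationary MMA with the A operand in tensor memory; the
  // collector::b2::lastuse variant reads the B collector buffer one last
  // time and releases it after this instruction.
  cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
    cuda::ptx::cta_group_1, // only .cta_group::1 is exposed for mma.ws
    cuda::ptx::kind_f16,    // selects the .kind::f16 asm path
    d_tmem,                 // tensor-memory address of the D accumulator
    a_tmem,                 // tensor-memory address of the A operand
    b_desc,                 // shared-memory descriptor for B
    idesc,                  // instruction descriptor
    true);                  // enable_input_d: accumulate into the existing D
#endif
}

If the surrounding #if in the generated wrapper is not satisfied, the call above compiles to a reference to the corresponding *_is_not_supported_before_SM_100a_SM_101a__ symbol and fails at link time, which is the diagnostic mechanism this patch relies on.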
+/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + 
cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_mma_ws_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + 
bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, 
""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [%0], 
%1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + 
"setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [%0], 
[%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == 
kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will 
have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // 
__cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + 
uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_mma_ws_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + 
bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, 
""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + 
"tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d, 
%5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + 
"setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_MMA_WS_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_shift.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_shift.h new file mode 100644 index 00000000000..0c28ba5d888 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_shift.h @@ -0,0 +1,36 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TCGEN05_SHIFT_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_SHIFT_H_ + +/* +// tcgen05.shift.cta_group.down [taddr]; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_shift_down( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_shift_down_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_shift_down(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.shift.cta_group::1.down [%0];" : : "r"(__taddr) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.shift.cta_group::2.down [%0];" : : "r"(__taddr) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_shift_down_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_SHIFT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_st.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_st.h new file mode 100644 index 00000000000..83e9d13810e --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_st.h @@ -0,0 +1,4554 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_TCGEN05_ST_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_ST_H_ + +/* +// tcgen05.st.sync.aligned.16x64b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x1.b32 [%0], {%1};" : : "r"(__taddr), "r"(__as_b32(__values[0])) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [%0], {%1};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x2.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + 
static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x4.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x8.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + 
"r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x16.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void 
tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x32.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x64.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + 
"r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + 
"r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x128.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x128.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + 
"r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 
(&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + 
"r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x1.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + 
asm("tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [%0], {%1, %2};"
+ :
+ : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1]))
+ : "memory");
+# else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__();
+# endif
+}
+#endif // __cccl_ptx_isa >= 860
+
+/*
+// tcgen05.st.sync.aligned.16x128b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
+template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
+__device__ static inline void tcgen05_st_16x128b(
+ uint32_t taddr,
+ const B32 (&values)[4]);
+*/
+#if __cccl_ptx_isa >= 860
+extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__();
+template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
+_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4])
+{
+ static_assert(sizeof(_B32) == 4, "");
+# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+ asm("tcgen05.st.sync.aligned.16x128b.x2.b32 [%0], {%1, %2, %3, %4};"
+ :
+ : "r"(__taddr),
+ "r"(__as_b32(__values[0])),
+ "r"(__as_b32(__values[1])),
+ "r"(__as_b32(__values[2])),
+ "r"(__as_b32(__values[3]))
+ : "memory");
+# else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__();
+# endif
+}
+#endif // __cccl_ptx_isa >= 860
+
+/*
+// tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
+template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
+__device__ static inline void tcgen05_st_16x128b_unpack_16b(
+ uint32_t taddr,
+ const B32 (&values)[4]);
+*/
+#if __cccl_ptx_isa >= 860
+extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__();
+template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
+_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4])
+{
+ static_assert(sizeof(_B32) == 4, "");
+# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+ asm("tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [%0], {%1, %2, %3, %4};"
+ :
+ : "r"(__taddr),
+ "r"(__as_b32(__values[0])),
+ "r"(__as_b32(__values[1])),
+ "r"(__as_b32(__values[2])),
+ "r"(__as_b32(__values[3]))
+ : "memory");
+# else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__();
+# endif
+}
+#endif // __cccl_ptx_isa >= 860
+
+/*
+// tcgen05.st.sync.aligned.16x128b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
+template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
+__device__ static inline void tcgen05_st_16x128b(
+ uint32_t taddr,
+ const B32 (&values)[8]);
+*/
+#if __cccl_ptx_isa >= 860
+extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__();
+template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
+_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8])
+{
+ static_assert(sizeof(_B32) == 4, "");
+# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+ asm("tcgen05.st.sync.aligned.16x128b.x4.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};"
+ :
+ : "r"(__taddr),
+ "r"(__as_b32(__values[0])),
+ "r"(__as_b32(__values[1])),
+ "r"(__as_b32(__values[2])),
+ "r"(__as_b32(__values[3])),
+ "r"(__as_b32(__values[4])),
+ "r"(__as_b32(__values[5])),
+ "r"(__as_b32(__values[6])),
+ "r"(__as_b32(__values[7]))
+ : "memory");
+#
else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x8.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if 
_CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x16.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 
(&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x32.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + 
"r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + 
"r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x64.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + 
"r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> 
+_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + 
"r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x1.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + 
"r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x2.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x4.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + 
"%16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x8.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + 
"r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || 
__CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x16.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, 
%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x32.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, 
%98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + 
"r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + 
"r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if 
_CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x1.b32 [%0], {%1};" : : "r"(__taddr), "r"(__as_b32(__values[0])) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [%0], {%1};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x2.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, 
SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x4.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x8.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" 
_CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x16.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + 
"r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x32.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + 
"r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x64.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + 
"r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + 
"r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x128.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x128.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + 
"r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, 
%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + 
"r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x1.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x1.b32 [%0], %1, {%2};" + : + : "r"(__taddr), "n"(__immHalfSplitoff.value), "r"(__as_b32(__values[0])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [%0], %1, {%2};" + : + : "r"(__taddr), "n"(__immHalfSplitoff.value), "r"(__as_b32(__values[0])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x2.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = 
true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x2.b32 [%0], %1, {%2, %3};" + : + : "r"(__taddr), "n"(__immHalfSplitoff.value), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [%0], %1, {%2, %3};" + : + : "r"(__taddr), "n"(__immHalfSplitoff.value), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x4.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x4.b32 [%0], %1, {%2, %3, %4, %5};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + 
uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x8.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x8.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported 
architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x16.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x16.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, +SM_101a template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x32.b32 
[taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x32.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, +SM_101a template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + 
"r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x64.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x64.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + 
"r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, +SM_101a template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + 
"r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x128.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x128.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, " + "%37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, " + "%59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, " + "%81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, " + "%103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, " + "%122, %123, %124, %125, %126, %127, %128, %129};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + 
"r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, +SM_101a template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 
[%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128, %129};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + 
"r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_ST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_wait.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_wait.h new file mode 100644 index 00000000000..5f683c07fea --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_wait.h @@ -0,0 +1,44 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_TCGEN05_WAIT_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_WAIT_H_ + +/* +// tcgen05.wait::ld.sync.aligned; // PTX ISA 86, SM_100a, SM_101a +template +__device__ static inline void tcgen05_wait_ld(); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_wait_ld_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_wait_ld() +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm volatile("tcgen05.wait::ld.sync.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_wait_ld_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.wait::st.sync.aligned; // PTX ISA 86, SM_100a, SM_101a +template +__device__ static inline void tcgen05_wait_st(); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_wait_st_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_wait_st() +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm volatile("tcgen05.wait::st.sync.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_wait_st_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h index b51b5185db0..db5e7dde640 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h @@ -24,37 +24,43 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || 
__CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h index 598b56f90b0..53c56e159f7 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h @@ -4,113 +4,127 @@ #define _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ /* -// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_global_address( cuda::ptx::space_global_t, void* tm_addr, B64 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_address(space_global_t, void* __tm_addr, _B64 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__tm_addr)), - "l"(__as_b64(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "l"(__as_b64(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 
830 /* -// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_global_address( cuda::ptx::space_shared_t, void* tm_addr, B64 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_address(space_shared_t, void* __tm_addr, _B64 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" : : "r"(__as_ptr_smem(__tm_addr)), - "l"(__as_b64(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "l"(__as_b64(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_rank( cuda::ptx::space_global_t, void* tm_addr, B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_rank(space_global_t, void* __tm_addr, _B32 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__tm_addr)), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// 
tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_rank( cuda::ptx::space_shared_t, void* tm_addr, B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_rank(space_shared_t, void* __tm_addr, _B32 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" : : "r"(__as_ptr_smem(__tm_addr)), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_box_dim( cuda::ptx::space_global_t, void* tm_addr, @@ -118,28 +132,30 @@ __device__ static inline void tensormap_replace_box_dim( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_box_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__ord.value), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif 
// __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_box_dim( cuda::ptx::space_shared_t, void* tm_addr, @@ -147,29 +163,30 @@ __device__ static inline void tensormap_replace_box_dim( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_box_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm( - "tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__ord.value), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_global_dim( cuda::ptx::space_global_t, void* tm_addr, @@ -177,28 +194,30 @@ __device__ static inline void tensormap_replace_global_dim( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__ord.value), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), 
"n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_global_dim( cuda::ptx::space_shared_t, void* tm_addr, @@ -206,29 +225,30 @@ __device__ static inline void tensormap_replace_global_dim( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm( - "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__ord.value), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_global_stride( cuda::ptx::space_global_t, void* tm_addr, @@ -236,28 +256,31 @@ __device__ static inline void tensormap_replace_global_stride( B64 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_stride(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__ord.value), - "l"(__as_b64(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - 
__cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_global_stride( cuda::ptx::space_shared_t, void* tm_addr, @@ -265,29 +288,98 @@ __device__ static inline void tensormap_replace_global_stride( B64 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_stride(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm( - "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__ord.value), - "l"(__as_b64(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, +SM_101a // .space = { .global } -template +template = true> +__device__ static inline void tensormap_replace_element_stride( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_element_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tensormap_replace_element_stride(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || 
__CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, +SM_101a +// .space = { .shared::cta } +template = true> +__device__ static inline void tensormap_replace_element_stride( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_element_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tensormap_replace_element_stride(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, +SM_101a +// .space = { .global } +template = true> __device__ static inline void tensormap_replace_element_size( cuda::ptx::space_global_t, void* tm_addr, @@ -295,28 +387,32 @@ __device__ static inline void tensormap_replace_element_size( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_element_size(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__ord.value), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // 
__cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, +SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_element_size( cuda::ptx::space_shared_t, void* tm_addr, @@ -324,27 +420,29 @@ __device__ static inline void tensormap_replace_element_size( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_element_size(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm( - "tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__ord.value), - "r"(__as_b32(__new_val)) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_elemtype( @@ -353,23 +451,26 @@ __device__ static inline void tensormap_replace_elemtype( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_elemtype(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported 
architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_elemtype( @@ -378,23 +479,26 @@ __device__ static inline void tensormap_replace_elemtype( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_elemtype(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_interleave_layout( @@ -403,24 +507,28 @@ __device__ static inline void tensormap_replace_interleave_layout( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_interleave_layout(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" + : 
+ : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_interleave_layout( @@ -429,24 +537,28 @@ __device__ static inline void tensormap_replace_interleave_layout( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_interleave_layout(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_swizzle_mode( @@ -455,23 +567,27 @@ __device__ static inline void tensormap_replace_swizzle_mode( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || 
__CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_swizzle_mode( @@ -480,23 +596,27 @@ __device__ static inline void tensormap_replace_swizzle_mode( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_fill_mode( @@ -505,23 +625,26 @@ __device__ static inline void tensormap_replace_fill_mode( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); +// __space == space_global (due 
to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_fill_mode( @@ -530,19 +653,78 @@ __device__ static inline void tensormap_replace_fill_mode( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" : : "r"(__as_ptr_smem(__tm_addr)), - "n"(__new_val.value) : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 +/* +// tensormap.replace.tile.swizzle_atomicity.space.b1024.b32 [tm_addr], new_val; // PTX ISA 86, SM_100a, SM_101a +// .space = { .global } +template +__device__ static inline void tensormap_replace_swizzle_atomicity( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_atomicity_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_swizzle_atomicity(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_atomicity_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tensormap.replace.tile.swizzle_atomicity.space.b1024.b32 [tm_addr], new_val; // PTX 
ISA 86, SM_100a, SM_101a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_swizzle_atomicity( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_atomicity_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_swizzle_atomicity(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_atomicity_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h new file mode 100644 index 00000000000..6f5a022dbc8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h @@ -0,0 +1,61 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
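// ---------------------------------------------------------------------------
// [Editor's note -- illustrative sketch, not part of this patch.] The rewritten
// tensormap_replace wrappers above all follow one pattern: the old
// NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a, ...) dispatch is replaced by a
// preprocessor guard over architecture-feature macros
// (__CUDA_ARCH_FEAT_SM90_ALL / SM100_ALL / SM101_ALL, with
// _CCCL_CUDA_COMPILER(NVHPC) also OR'ed into the guard), and the fallback
// branch calls an extern "C" function that is declared but never defined, so
// building for an unsupported architecture fails at link time with a
// self-describing symbol name. A minimal, hypothetical version of that idiom
// (my_feature and __my_feature_is_not_supported_before_SM_90a__ are
// placeholder names, not cuda::ptx APIs):

// Declared, intentionally never defined: referencing it on an unsupported
// architecture surfaces as a readable linker error instead of invalid PTX.
extern "C" __device__ void __my_feature_is_not_supported_before_SM_90a__();

__device__ inline void my_feature(unsigned* gmem_addr, unsigned new_val)
{
#if __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL
  // Architectures that expose the feature emit the instruction directly
  // (a plain generic store stands in for the real tensormap instruction here).
  asm volatile("st.u32 [%0], %1;" : : "l"(gmem_addr), "r"(new_val) : "memory");
#else
  // Still compiles for every target; the undefined symbol only bites at link time.
  __my_feature_is_not_supported_before_SM_90a__();
#endif
}
// ---------------------------------------------------------------------------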
+ +__global__ void test_barrier_cluster_aligned(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.wait.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::barrier_cluster_wait));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive.release.aligned; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive.relaxed.aligned; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // barrier.cluster.wait.acquire.aligned; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::barrier_cluster_wait));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/clusterlaunchcontrol.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/clusterlaunchcontrol.h new file mode 100644 index 00000000000..c5df06bc787 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/clusterlaunchcontrol.h @@ -0,0 +1,84 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
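// ---------------------------------------------------------------------------
// [Editor's note -- illustrative sketch, not part of this patch.] The comment
// block above is repeated verbatim at the top of every generated test header
// and describes the whole mechanism: each overload's address is stored through
// the fn_ptr kernel parameter inside an NV_IF_TARGET guard, which forces the
// compiler to keep the wrapper and emit its PTX even though the kernel is
// never launched. A stripped-down, hypothetical version of one of these test
// kernels (my_wrapper and test_my_wrapper are placeholder names, not
// cuda::ptx APIs):

#include <nv/target>

// Stand-in for a cuda::ptx wrapper overload under test.
__device__ static inline void my_wrapper(void*) {}

__global__ void test_my_wrapper(void** fn_ptr)
{
  // Taking the overload's address and writing it through a kernel parameter
  // defeats dead-code elimination, so ptxas still sees the wrapper's PTX.
  NV_IF_TARGET(NV_PROVIDES_SM_90,
               (*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(void*)>(my_wrapper));));
}
// ---------------------------------------------------------------------------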
+ +__global__ void test_clusterlaunchcontrol(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [addr], + // [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::clusterlaunchcontrol_try_cancel));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 + // [addr], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::clusterlaunchcontrol_try_cancel_multicast));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 + // [addr], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::clusterlaunchcontrol_try_cancel_multicast));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 pred_is_canceled, try_cancel_response; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::clusterlaunchcontrol_query_cancel_is_canceled));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 ret_dim, try_cancel_response; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_x));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 ret_dim, try_cancel_response; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_y));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 ret_dim, try_cancel_response; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_z));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 block_dim, try_cancel_response; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h index a342954591a..de118140440 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h @@ -20,20 +20,30 @@ __global__ void test_cp_async_bulk(void** fn_ptr) NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // - // 1a. 
unicast + // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, - // [rdsmem_bar]; // 2. + // [rdsmem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -44,10 +54,21 @@ __global__ void test_cp_async_bulk(void** fn_ptr) NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3. + // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_100, + ( + // cp.async.bulk.global.shared::cta.bulk_group.cp_mask [dstMem], [srcMem], size, byteMask; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_cp_mask));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h index 6e2a986e7bd..81298beb481 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h @@ -21,7 +21,33 @@ __global__ void test_cp_async_bulk_multicast(void** fn_ptr) NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], - // size, [smem_bar], ctaMask; // 1. 
+ // size, [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], + // size, [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], + // size, [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. + // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -44,18 +105,79 @@ __global__ void test_cp_async_bulk_tensor(void** fn_ptr) NV_PROVIDES_SM_90, ( // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1b. 
+ // tensorCoords], [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. + // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -67,18 +189,79 @@ __global__ void test_cp_async_bulk_tensor(void** fn_ptr) NV_PROVIDES_SM_90, ( // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1c. 
+ // tensorCoords], [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. + // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -90,18 +273,79 @@ __global__ void test_cp_async_bulk_tensor(void** fn_ptr) NV_PROVIDES_SM_90, ( // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1d. 
+ // tensorCoords], [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. + // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -113,18 +357,79 @@ __global__ void test_cp_async_bulk_tensor(void** fn_ptr) NV_PROVIDES_SM_90, ( // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1e. 
+ // tensorCoords], [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. + // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h new file mode 100644 index 00000000000..930cfa09125 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h @@ -0,0 +1,180 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ +__global__ void test_cp_async_bulk_tensor_gather_scatter(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_100, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4)); + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4)); + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4)); + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4)); + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor_tile_scatter4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor_tile_scatter4));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h index 617bc9507bd..3f3a08764d2 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h @@ -21,7 +21,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. + // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 #if __cccl_ptx_isa >= 800 @@ -37,7 +116,20 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) NV_HAS_FEATURE_SM_90a, ( // 
cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 #if __cccl_ptx_isa >= 800 @@ -53,7 +211,33 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. 
+ // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. 
+ // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. 
+ // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive.h new file mode 100644 index 00000000000..663c07b4121 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ +__global__ void test_cp_async_mbarrier_arrive(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET( + NV_PROVIDES_SM_80, + ( + // cp.async.mbarrier.arrive.b64 [addr]; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive_noinc.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive_noinc.h new file mode 100644 index 00000000000..a089c727903 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive_noinc.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_cp_async_mbarrier_arrive_noinc(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // cp.async.mbarrier.arrive.noinc.b64 [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_mbarrier_arrive_noinc));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h new file mode 100644 index 00000000000..298225881d1 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_elect_sync(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // elect.sync _|is_elected, membermask; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::elect_sync));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h index aecfcde5e01..0738677ed33 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h @@ -28,10 +28,24 @@ __global__ void test_fence(void** fn_ptr) static_cast(cuda::ptx::fence)); // fence.sc.sys; // 1. * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.cta; // 1. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 600 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // fence.sc.cluster; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 600 + NV_IF_TARGET( + NV_PROVIDES_SM_70, + ( + // fence.acq_rel.cta; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); // fence.acq_rel.gpu; // 1. * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::fence)); @@ -41,14 +55,46 @@ __global__ void test_fence(void** fn_ptr) #endif // __cccl_ptx_isa >= 600 #if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // fence.acq_rel.cluster; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 860 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // fence.sc.cluster; // 2. + // fence.acquire.cta; * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.cluster; // 2. + static_cast(cuda::ptx::fence)); + // fence.acquire.cluster; * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence));)); -#endif // __cccl_ptx_isa >= 780 + static_cast(cuda::ptx::fence)); + // fence.acquire.gpu; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acquire.sys; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.release.cta; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.release.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.release.gpu; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.release.sys; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async_generic_sync_restrict.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async_generic_sync_restrict.h new file mode 100644 index 00000000000..7af3a09ad2b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async_generic_sync_restrict.h @@ -0,0 +1,38 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
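// ---------------------------------------------------------------------------
// [Editor's note -- illustrative sketch, not part of this patch.] The fence.h
// changes above add .acquire/.release variants of cuda::ptx::fence at cta,
// cluster, gpu and sys scope (PTX ISA 8.6, SM_90 or newer). Assuming the usual
// cuda::ptx tag objects corresponding to the ..._t types in the casts above
// (sem_release, sem_acquire, scope_gpu), a guarded call site could look like:

#include <cuda/ptx>
#include <nv/target>

__device__ void release_then_acquire_gpu_scope()
{
#if __cccl_ptx_isa >= 860
  NV_IF_TARGET(NV_PROVIDES_SM_90,
               (
                 // fence.release.gpu;
                 cuda::ptx::fence(cuda::ptx::sem_release, cuda::ptx::scope_gpu);
                 // fence.acquire.gpu;
                 cuda::ptx::fence(cuda::ptx::sem_acquire, cuda::ptx::scope_gpu);));
#endif
}
// ---------------------------------------------------------------------------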
+ +__global__ void test_fence_proxy_async_generic_sync_restrict(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::fence_proxy_async_generic_sync_restrict));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::fence_proxy_async_generic_sync_restrict));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_sync_restrict.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_sync_restrict.h new file mode 100644 index 00000000000..c673d840428 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_sync_restrict.h @@ -0,0 +1,38 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_fence_sync_restrict(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.acquire.sync_restrict::shared::cluster.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::fence_sync_restrict));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.release.sync_restrict::shared::cta.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::fence_sync_restrict));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h new file mode 100644 index 00000000000..9160be1fe2d --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h @@ -0,0 +1,27 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
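[Editor's note, not part of the patch] The generated comment block above recurs in every one of these headers and describes the core trick of the test suite, so a minimal hand-written sketch may help. It is an illustration only; `dummy_wrapper` is a hypothetical stand-in for one of the `cuda::ptx` overloads exercised by the real tests. Writing the overload's address through the externally visible kernel parameter `fn_ptr` keeps NVVM's dead-code elimination from discarding it, so its PTX is always emitted and ptxas must assemble it.

#include <cstdint>
#include <nv/target>

// Hypothetical stand-in for a cuda::ptx wrapper; present only so the sketch
// is self-contained.
__device__ static std::uint32_t dummy_wrapper(const std::uint32_t* addr)
{
  return *addr;
}

__global__ void force_ptx_emission_sketch(void** fn_ptr)
{
  // Selecting the overload with static_cast and storing its address through a
  // kernel parameter that is visible outside this translation unit forces the
  // compiler to keep the function, compile it, and emit its PTX, mirroring
  // the guarded pattern used by the generated tests.
  NV_IF_TARGET(NV_PROVIDES_SM_90,
               (* fn_ptr++ = reinterpret_cast<void*>(
                  static_cast<std::uint32_t (*)(const std::uint32_t*)>(dummy_wrapper));));
}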
+ +__global__ void test_mapa(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mapa.shared::cluster.u32 dest, addr, target_cta; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mapa));)); +#endif // __cccl_ptx_isa >= 780 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h index 3cddcb3b54c..d32773c118d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h @@ -87,4 +87,60 @@ __global__ void test_mbarrier_arrive(void** fn_ptr) cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t*, const uint32_t&)>( cuda::ptx::mbarrier_arrive));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.relaxed.cta.shared::cta.b64 state, [addr], count; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive)); + // mbarrier.arrive.relaxed.cluster.shared::cta.b64 state, [addr], count; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.relaxed.cta.shared::cta.b64 state, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive)); + // mbarrier.arrive.relaxed.cluster.shared::cta.b64 state, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [addr], count; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h index a2ef4b619bb..8ef925662ac 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h @@ -44,4 +44,33 @@ __global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr) cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t*, const uint32_t&)>( cuda::ptx::mbarrier_arrive_expect_tx));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 state, [addr], txCount; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive_expect_tx)); + // mbarrier.arrive.expect_tx.relaxed.cluster.shared::cta.b64 state, [addr], txCount; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive_expect_tx));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64 _, [addr], txCount; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive_expect_tx));)); +#endif // __cccl_ptx_isa >= 860 } diff --git 
a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_expect_tx.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_expect_tx.h new file mode 100644 index 00000000000..8dd3b6a2037 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_expect_tx.h @@ -0,0 +1,50 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_mbarrier_expect_tx(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [addr], txCount; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_expect_tx)); + // mbarrier.expect_tx.relaxed.cluster.shared::cta.b64 [addr], txCount; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_expect_tx));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.expect_tx.relaxed.cta.shared::cluster.b64 [addr], txCount; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_expect_tx)); + // mbarrier.expect_tx.relaxed.cluster.shared::cluster.b64 [addr], txCount; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_expect_tx));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait.h new file mode 100644 index 00000000000..c9c0d0d14fb --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait.h @@ -0,0 +1,55 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_mbarrier_test_wait(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 700 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // 2. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait)); + // mbarrier.test_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.relaxed.cta.shared::cta.b64 waitComplete, [addr], state; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait)); + // mbarrier.test_wait.relaxed.cluster.shared::cta.b64 waitComplete, [addr], state; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait_parity.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait_parity.h new file mode 100644 index 00000000000..f44c0554308 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait_parity.h @@ -0,0 +1,55 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_mbarrier_test_wait_parity(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 710 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 710 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity)); + // mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64 waitComplete, [addr], phaseParity; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity)); + // mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h index 00166f8172c..1a1b347751c 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h @@ -66,4 +66,35 @@ __global__ void test_mbarrier_try_wait(void** fn_ptr) cuda::ptx::sem_acquire_t, cuda::ptx::scope_cluster_t, uint64_t*, const uint64_t&, const uint32_t&)>( cuda::ptx::mbarrier_try_wait));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.relaxed.cta.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait)); + // mbarrier.try_wait.relaxed.cluster.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.relaxed.cta.shared::cta.b64 waitComplete, [addr], state; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait)); + // mbarrier.try_wait.relaxed.cluster.shared::cta.b64 waitComplete, [addr], state; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h index 8aa588fbab0..4a5ef3e926f 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h @@ -65,4 +65,36 @@ __global__ void test_mbarrier_try_wait_parity(void** fn_ptr) cuda::ptx::sem_acquire_t, cuda::ptx::scope_cluster_t, uint64_t*, const uint32_t&, const uint32_t&)>( cuda::ptx::mbarrier_try_wait_parity));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity)); + // mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 waitComplete, [addr], phaseParity, + // suspendTimeHint; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 waitComplete, [addr], phaseParity; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity)); + // mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h deleted file mode 100644 index 80129e5016c..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h +++ /dev/null @@ -1,24 +0,0 @@ -__global__ void test_mbarrier_test_wait(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_test_wait));)); -#endif // __cccl_ptx_isa >= 700 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.test_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait)); - // mbarrier.test_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h deleted file mode 100644 index 30902c58905..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h +++ /dev/null @@ -1,24 +0,0 @@ -__global__ void test_mbarrier_test_wait_parity(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 710 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_test_wait_parity));)); -#endif // __cccl_ptx_isa >= 710 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait_parity)); - // mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait_parity));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_ld_reduce.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_ld_reduce.h new file mode 100644 index 00000000000..c0259451a1b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_ld_reduce.h @@ -0,0 +1,1020 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
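[Editor's note, not part of the patch] As a reading aid for the long list of overloads that follows: each PTX comment names the instruction a wrapper emits, and the tag arguments (`sem_*`, `scope_*`, `op_*`) select the overload. The sketch below is a hypothetical usage example that assumes the documented tag-dispatch signatures of `cuda::ptx::multimem_ld_reduce` (a weak form without a scope tag, and relaxed/acquire forms with one) and assumes `mc_ptr` refers to multimem-mapped memory; an ordinary global pointer is not valid for these instructions.

#include <cstdint>
#include <cuda/ptx>
#include <nv/target>

__device__ std::uint32_t ld_reduce_min_sketch(const std::uint32_t* mc_ptr)
{
  std::uint32_t result = 0;
#if __cccl_ptx_isa >= 810
  NV_IF_TARGET(NV_PROVIDES_SM_90,
               (
                 // multimem.ld_reduce.weak.global.min.u32 dest, [addr];
                 result = cuda::ptx::multimem_ld_reduce(
                   cuda::ptx::sem_weak, cuda::ptx::op_min, mc_ptr);
                 // multimem.ld_reduce.relaxed.cta.global.min.u32 dest, [addr];
                 result += cuda::ptx::multimem_ld_reduce(
                   cuda::ptx::sem_relaxed, cuda::ptx::scope_cta, cuda::ptx::op_min, mc_ptr);));
#endif // __cccl_ptx_isa >= 810
  return result;
}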
+ +__global__ void test_multimem_ld_reduce(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.min.s32 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.min.s32 dest, [addr]; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.min.s64 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( 
+ static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.max.s32 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.max.s64 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + 
+#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.add.s32 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.add.u64 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( 
+ static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // 
multimem.ld_reduce.weak.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_red.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_red.h new file mode 100644 index 00000000000..dd0011e3fb2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_red.h @@ -0,0 +1,840 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. 
+// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_multimem_red(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.max.u64 [addr], val; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.add.u32 
[addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // 
multimem.red.relaxed.sys.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.xor.b64 [addr], val; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_st.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_st.h new file mode 100644 index 00000000000..b61c25430ed --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_st.h @@ -0,0 +1,110 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
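A minimal sketch of this forcing strategy, using a hypothetical `my_wrapper` function instead of the real `cuda::ptx` wrappers: the kernel is compiled but never launched, and because the wrapper's address escapes through the externally visible `fn_ptr` parameter, the compiler has to emit the wrapper (and the PTX it contains) rather than eliminate it as dead code.

// Hypothetical stand-in for a cuda::ptx wrapper (not a real API).
__device__ void my_wrapper(unsigned long long* addr, unsigned long long val)
{
  *addr = val; // the real wrappers hold a single inline-PTX instruction here
}

// Compiled but never launched: storing the wrapper's address through the
// kernel parameter keeps it from being discarded, so its code is generated.
__global__ void test_my_wrapper(void** fn_ptr)
{
  *fn_ptr++ = reinterpret_cast<void*>(
    static_cast<void (*)(unsigned long long*, unsigned long long)>(my_wrapper));
}

The generated tests below apply exactly this pattern to every overload of the wrappers they cover.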
+ +__global__ void test_multimem_st(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // multimem.st.weak.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::multimem_st));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.st.relaxed.cta.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.cluster.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.gpu.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.sys.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.cta.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.cluster.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.gpu.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.sys.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // multimem.st.weak.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::multimem_st));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.st.relaxed.cta.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.cluster.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.gpu.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.sys.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.cta.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.cluster.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.gpu.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.sys.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_bulk.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_bulk.h new file mode 100644 index 00000000000..d9203b625e8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_bulk.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. 
+// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_st_bulk(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // st.bulk.weak.shared::cta [addr], size, initval; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::st_bulk));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_alloc.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_alloc.h new file mode 100644 index 00000000000..48a40f6f23c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_alloc.h @@ -0,0 +1,81 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_tcgen05_alloc(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [dst], nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_alloc)); + // tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [dst], nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_alloc));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [dst], nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_alloc)); + // tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [dst], nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_alloc));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.dealloc.cta_group::1.sync.aligned.b32 taddr, nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_dealloc)); + // tcgen05.dealloc.cta_group::2.sync.aligned.b32 taddr, nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_dealloc));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.dealloc.cta_group::1.sync.aligned.b32 taddr, nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_dealloc)); + // tcgen05.dealloc.cta_group::2.sync.aligned.b32 taddr, nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_dealloc));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_relinquish_alloc_permit)); + // tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_relinquish_alloc_permit));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_relinquish_alloc_permit)); + // tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_relinquish_alloc_permit));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_commit.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_commit.h new file mode 100644 index 00000000000..c41981e6917 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_commit.h @@ -0,0 +1,62 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_tcgen05_commit(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit)); + // tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit)); + // tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit_multicast)); + // tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], + // ctaMask; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_commit_multicast));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit_multicast)); + // tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], + // ctaMask; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_commit_multicast));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_cp.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_cp.h new file mode 100644 index 00000000000..4c37cb11cfa --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_cp.h @@ -0,0 +1,396 @@ +// This file was automatically generated. Do not edit. 
+ +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_tcgen05_cp(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x256b)); + // tcgen05.cp.cta_group::2.128x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x256b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x256b)); + // tcgen05.cp.cta_group::2.128x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.4x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_4x256b)); + // tcgen05.cp.cta_group::2.4x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_4x256b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.4x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_4x256b)); + // tcgen05.cp.cta_group::2.4x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_4x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x128b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x128b)); + // tcgen05.cp.cta_group::2.128x128b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x128b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x128b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x128b)); + // tcgen05.cp.cta_group::2.128x128b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + 
cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_32x128b_warpx4)); + // tcgen05.cp.cta_group::2.32x128b.warpx4 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_32x128b_warpx4)); + // tcgen05.cp.cta_group::2.32x128b.warpx4 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b6x16_p32)); + // 
tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + 
cuda::ptx::tcgen05_cp_128x256b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 
[taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_fence.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_fence.h new file mode 100644 index 00000000000..75b2ec35fa5 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_fence.h @@ -0,0 +1,44 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
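Each block in these generated tests is guarded twice: a preprocessor check on `__cccl_ptx_isa` keeps it out of toolchains whose PTX ISA is too old, and `NV_IF_TARGET` keeps it out of code generation for architectures that lack the instruction. A simplified, hypothetical illustration of that guard structure follows; `my_fence` is made up, and only the guards mirror the generated code.

#include <cuda/ptx>  // as in the generated tests, __cccl_ptx_isa is expected to come from the cuda::ptx headers
#include <nv/target> // NV_IF_TARGET and the NV_HAS_FEATURE_SM_100a target condition

__device__ void my_fence() {} // hypothetical stand-in for a fence wrapper

__global__ void test_my_fence(void** fn_ptr)
{
#if __cccl_ptx_isa >= 860 // skipped entirely when the available PTX ISA is older
  NV_IF_TARGET(NV_HAS_FEATURE_SM_100a,
               (*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(my_fence));));
#endif // __cccl_ptx_isa >= 860
}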
+ +__global__ void test_tcgen05_fence(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.fence::before_thread_sync; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_fence_before_thread_sync));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.fence::before_thread_sync; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_fence_before_thread_sync));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.fence::after_thread_sync; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_fence_after_thread_sync));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.fence::after_thread_sync; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_fence_after_thread_sync));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_ld.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_ld.h new file mode 100644 index 00000000000..48ecce5869e --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_ld.h @@ -0,0 +1,1012 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
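Many of these wrappers share one name across several shapes (the `.x1`, `.x2`, ... forms of `tcgen05.ld.sync.aligned.16x64b` all go through `cuda::ptx::tcgen05_ld_16x64b`, for example), so a bare function name would not pick out a single function. The generated code therefore first `static_cast`s the name to an exact function-pointer type, which selects one overload, and only then type-erases the result to `void*` for storage. A hypothetical sketch of that selection step, with a made-up `my_ld` overload set:

// Hypothetical overloads standing in for an overloaded PTX wrapper.
__device__ void my_ld(unsigned (&out)[1], unsigned taddr) { out[0] = taddr; }
__device__ void my_ld(unsigned (&out)[2], unsigned taddr) { out[0] = out[1] = taddr; }

__global__ void test_my_ld(void** fn_ptr)
{
  // static_cast to the exact signature disambiguates the overload;
  // reinterpret_cast then erases the function-pointer type for storage.
  *fn_ptr++ = reinterpret_cast<void*>(
    static_cast<void (*)(unsigned (&)[1], unsigned)>(my_ld));
  *fn_ptr++ = reinterpret_cast<void*>(
    static_cast<void (*)(unsigned (&)[2], unsigned)>(my_ld));
}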
+ +__global__ void test_tcgen05_ld(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; + * 
fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, 
[taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + 
NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + 
static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + 
NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma.h new file mode 100644 index 00000000000..7146c395fa7 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma.h @@ -0,0 +1,2928 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ +__global__ void test_tcgen05_mma(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], 
a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, 
enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, 
disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], [a_tmem], b_desc, 
idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], 
[scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if 
__cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], 
[scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], 
a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], 
[scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use));)); + 
NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_lastuse)); + // 
tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma_ws.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma_ws.h new file mode 100644 index 00000000000..7e1674f39fc --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma_ws.h @@ -0,0 +1,3570 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
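// --- Editorial illustration (not part of the generated file or of this patch) ---
// A minimal sketch of the force-instantiation strategy described in the comment
// above. `my_wrapper` and `test_force_ptx_emission` are hypothetical stand-ins
// for the overloaded cuda::ptx::* wrappers and generated test kernels; the real
// tests store the cuda::ptx overloads shown in the hunks below. Selecting each
// overload with static_cast, erasing its type with reinterpret_cast<void*>, and
// writing it through the externally visible `fn_ptr` kernel parameter is what
// keeps NVVM's dead-code elimination from discarding the wrapped PTX.

#include <cuda/ptx>  // assumed here only for the __cccl_ptx_isa version macro
#include <nv/target> // NV_IF_TARGET, NV_HAS_FEATURE_SM_100a

__device__ static void my_wrapper(int) {}   // hypothetical overload #1
__device__ static void my_wrapper(float) {} // hypothetical overload #2

__global__ void test_force_ptx_emission(void** fn_ptr)
{
#if __cccl_ptx_isa >= 860
  NV_IF_TARGET(
    NV_HAS_FEATURE_SM_100a,
    (
      // Disambiguate each overload with static_cast to a concrete function-pointer
      // type, then erase the type so the address can be stored through void**.
      * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(int)>(my_wrapper));
      * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(float)>(my_wrapper));));
#endif // __cccl_ptx_isa >= 860
}
// --- End of editorial illustration ---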
+ +__global__ void test_tcgen05_mma_ws(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // 
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill));)); + 
NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // 
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, 
zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + 
// enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // 
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill));)); + NV_IF_TARGET( + 
NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + 
NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], 
a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // 
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, 
+ ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // 
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse 
[d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse 
[d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // 
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], [a_tmem], 
b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + 
static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, 
zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, 
zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse));)); 
+#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse 
[d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // 
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // 
enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_shift.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_shift.h new file mode 100644 index 00000000000..293d2787a87 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_shift.h @@ -0,0 +1,39 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
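For reference, the forcing pattern described in the comment above can be shown in a minimal standalone form. This sketch is an editor's illustration only (it is not part of the patch) and uses two hypothetical __device__ overloads named my_op in place of the real cuda::ptx wrappers:

__device__ void my_op(int) {}   // hypothetical overload 1
__device__ void my_op(float) {} // hypothetical overload 2

__global__ void force_instantiation(void** fn_ptr)
{
  // static_cast selects a specific overload; reinterpret_cast then erases its
  // type so every overload can be stored through the same void** kernel
  // parameter. Because the stored addresses may escape the translation unit,
  // the compiler must keep the functions and emit PTX for them instead of
  // discarding them as dead code.
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(int)>(my_op));
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(float)>(my_op));
}

The generated tests below apply this same idea to each overload of the PTX wrapper being exercised, wrapped in NV_IF_TARGET and __cccl_ptx_isa guards so the addresses are only taken where the instruction is actually supported.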
+ +__global__ void test_tcgen05_shift(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.shift.cta_group::1.down [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_shift_down)); + // tcgen05.shift.cta_group::2.down [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_shift_down));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.shift.cta_group::1.down [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_shift_down)); + // tcgen05.shift.cta_group::2.down [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_shift_down));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_st.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_st.h new file mode 100644 index 00000000000..ec8cb758e5d --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_st.h @@ -0,0 +1,1012 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_tcgen05_st(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x128.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x128.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + 
NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x4.b32 [taddr], values; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.st.sync.aligned.32x32b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x128.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x128.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); 
+#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x1.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[1])>(cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x1.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[1])>(cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[1])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[1])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x2.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[2])>(cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x2.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[2])>(cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[2])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[2])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x4.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[4])>(cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x4.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[4])>(cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[4])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[4])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x8.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const 
int32_t(&)[8])>(cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x8.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[8])>(cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[8])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[8])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x16.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[16])>( + cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x16.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[16])>( + cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[16])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[16])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x32.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[32])>( + cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x32.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[32])>( + cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[32])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[32])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x64.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[64])>( + cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x64.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const 
int32_t(&)[64])>( + cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[64])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[64])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x128.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[128])>( + cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x128.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[128])>( + cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[128])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[128])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_wait.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_wait.h new file mode 100644 index 00000000000..424d884049c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_wait.h @@ -0,0 +1,40 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ +__global__ void test_tcgen05_wait(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.wait::ld.sync.aligned; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_wait_ld));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.wait::ld.sync.aligned; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_wait_ld));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.wait::st.sync.aligned; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_wait_st));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.wait::st.sync.aligned; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_wait_st));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h index 95446eb81fa..1439bc84bd0 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h @@ -20,7 +20,19 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; + // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast( cuda::ptx::tensormap_replace_global_address));)); #endif // __cccl_ptx_isa >= 830 @@ -29,7 +41,19 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; + // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast( cuda::ptx::tensormap_replace_global_address));)); #endif // __cccl_ptx_isa >= 830 @@ -38,7 +62,19 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast( 
static_cast(cuda::ptx::tensormap_replace_rank));)); #endif // __cccl_ptx_isa >= 830 @@ -47,7 +83,19 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::tensormap_replace_rank));)); #endif // __cccl_ptx_isa >= 830 @@ -56,7 +104,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_box_dim));)); @@ -66,7 +128,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_box_dim));)); @@ -76,7 +152,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_global_dim));)); @@ -86,7 +176,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 
[tm_addr], ord, new_val; + // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_global_dim));)); @@ -96,7 +200,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; + // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int64_t)>( cuda::ptx::tensormap_replace_global_stride));)); @@ -106,17 +224,93 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; + // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int64_t)>( cuda::ptx::tensormap_replace_global_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); + NV_IF_TARGET( + 
NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); #endif // __cccl_ptx_isa >= 830 #if __cccl_ptx_isa >= 830 NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_element_size));)); @@ -126,7 +320,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_element_size));)); @@ -136,7 +344,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_elemtype));)); @@ -146,7 +368,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + 
cuda::ptx::tensormap_replace_elemtype));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_elemtype));)); @@ -160,6 +396,20 @@ __global__ void test_tensormap_replace(void** fn_ptr) * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_interleave_layout));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.interleave_layout.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.interleave_layout.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); #endif // __cccl_ptx_isa >= 830 #if __cccl_ptx_isa >= 830 @@ -170,13 +420,41 @@ __global__ void test_tensormap_replace(void** fn_ptr) * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_interleave_layout));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); #endif // __cccl_ptx_isa >= 830 #if __cccl_ptx_isa >= 830 NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_swizzle_mode));)); @@ -186,7 +464,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_swizzle_mode));)); @@ -196,7 +488,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + 
cuda::ptx::tensormap_replace_fill_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_fill_mode));)); @@ -206,9 +512,57 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_fill_mode));)); #endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_atomicity));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_atomicity));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_atomicity));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_atomicity));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h b/libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h new file mode 100644 index 00000000000..fef34f25ef4 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// NVRTC ships a built-in copy of , so including CCCL's version of this header will omit the +// content since the header guards are already defined. To make older NVRTC versions have a few newer feature macros +// required for the PTX tests, we define them here outside the header guards. 
+// TODO(bgruber): limit this workaround to NVRTC versions older than the first one shipping those macros +#ifdef __CUDACC_RTC__ +# ifndef NV_HAS_FEATURE_SM_100a +# define NV_HAS_FEATURE_SM_100a __NV_HAS_FEATURE_SM_100a +# if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && defined(__CUDA_ARCH_FEAT_SM100_ALL)) +# define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_100a 1 +# else +# define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_100a 0 +# endif +# endif // NV_HAS_FEATURE_SM_100a + +// Re-enable sm_101a support in nvcc. +# ifndef NV_HAS_FEATURE_SM_101a +# define NV_HAS_FEATURE_SM_101a __NV_HAS_FEATURE_SM_101a +# if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1010) && defined(__CUDA_ARCH_FEAT_SM101_ALL)) +# define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_101a 1 +# else +# define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_101a 0 +# endif +# endif // NV_HAS_FEATURE_SM_101a +#endif // __CUDACC_RTC__ diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp index 33d08621ef4..003d8f97017 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/barrier_cluster.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp index e7ff21c2730..1bf931109ed 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_commit_group.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp index fdd35749cc6..be56b1b922c 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp index ae1546828ae..226dbe5cf47 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp @@ -16,6 +16,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_multicast.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp index eeb7b4bf5a5..42bc5b8e355 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp +++ 
b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_tensor.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp index d07351a2275..65172d72897 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp @@ -16,6 +16,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_tensor_multicast.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp index 87910d04941..b31a9fb6a81 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_wait_group.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp index 8b916d74bf9..76a9357ae2f 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_reduce_async_bulk.h" #ifdef _LIBCUDACXX_HAS_NVF16 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp index f6a6fd61735..289f3dd9411 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_reduce_async_bulk_tensor.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp index 56f54b345f7..c439720b8f8 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/fence.h" #include "generated/fence_mbarrier_init.h" #include "generated/fence_proxy_alias.h" diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp index 91a6dd94bf1..adf6bb3e769 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp +++ 
b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp @@ -15,6 +15,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/get_sreg.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp index ed39816b7d6..9935b0563d2 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/getctarank.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp index 93263910906..a0948e86b18 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/mbarrier_arrive.h" #include "generated/mbarrier_arrive_expect_tx.h" #include "generated/mbarrier_arrive_no_complete.h" diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp index 7af0db56b70..0583b4f6e29 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/mbarrier_init.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp index 896abb8a7d8..732db4f16a1 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp @@ -14,10 +14,12 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header +#include "generated/mbarrier_test_wait.h" +#include "generated/mbarrier_test_wait_parity.h" #include "generated/mbarrier_try_wait.h" #include "generated/mbarrier_try_wait_parity.h" -#include "generated/mbarrier_wait.h" -#include "generated/mbarrier_wait_parity.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp index c6f66503b1f..2993ba3893d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/red_async.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp index 7c008b77126..a833a3770f4 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp +++ 
b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/st_async.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp index bb5578fc730..5d8566be5b5 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/tensormap_cp_fenceproxy.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp index 264b7956fbb..f0c91aa2296 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/tensormap_replace.h" int main(int, char**) From 671ee2fc626f609bfae3f744ff4fb3d6b7109d47 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Wed, 29 Jan 2025 19:44:57 -0500 Subject: [PATCH 3/4] Update CI matrix to use NVKS nodes. (#3572) * Update CI matrix to use NVKS nodes. * Update windows CI scripts to accept -arch. * Move all non-Catch2 device algo tests to lid0/lid1. This makes sure that they run in the correct CI config on appropriate hardware. 
* Switch to all rtx queues: CUB -> RTXA6000 (48GiB) Thrust -> RTX4090 (24GiB) Others -> RTX2080 (8GiB) --- ci/matrix.yaml | 73 ++++++++++++++------------------- ci/windows/build_common.psm1 | 15 ++++++- ci/windows/build_cub.ps1 | 8 +++- ci/windows/build_cudax.ps1 | 8 +++- ci/windows/build_libcudacxx.ps1 | 8 +++- ci/windows/build_thrust.ps1 | 8 +++- ci/windows/test_thrust.ps1 | 8 +++- cub/test/CMakeLists.txt | 9 ++++ 8 files changed, 83 insertions(+), 54 deletions(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index c3f03d323ab..5ec715fb59b 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -19,49 +19,51 @@ workflows: - {jobs: ['build'], std: 'max', cxx: ['msvc2019']} - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']} # Current CTK testing: - - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc', 'clang']} + - {jobs: ['test'], project: ['thrust'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx4090'} + - {jobs: ['test'], project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx2080'} # Disabled until we figure out the issue with the TBB dll - #- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc']} + #- {jobs: ['test'], project: ['thrust'], std: 'max', cxx: ['msvc'], gpu: 'rtx4090'} + - {jobs: ['test'], project: ['libcudacxx'], std: 'max', cxx: ['msvc'], gpu: 'rtx2080'} # Split up cub tests: - - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc']} - - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']} - - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc']} - - {jobs: ['test_lid0'], project: ['cub'], std: 'max', cxx: 'gcc12', gpu: 'h100', sm: 'gpu' } + - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc'], gpu: 'rtxa6000'} + - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc'], gpu: 'rtxa6000'} + - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 'rtxa6000'} + - {jobs: ['test_lid0'], project: ['cub'], std: 'max', cxx: 'gcc12', gpu: 'h100', sm: 'gpu' } # Modded builds: - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'} - {jobs: ['build'], std: 'max', cxx: ['gcc', 'clang'], cpu: 'arm64'} - {jobs: ['build'], std: 'max', cxx: ['gcc'], sm: '90a'} # Test Thrust 32-bit-only dispatch here, since it's most likely to break. 64-bit-only is tested in nightly. 
- - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'} # default_projects: clang-cuda - {jobs: ['build'], std: 'all', cudacxx: 'clang', cxx: 'clang'} - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90'} - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90a'} # nvrtc: - - {jobs: ['nvrtc'], project: 'libcudacxx', std: 'all'} + - {jobs: ['nvrtc'], project: 'libcudacxx', std: 'all', gpu: 'rtx2080', sm: 'gpu'} # verify-codegen: - {jobs: ['verify_codegen'], project: 'libcudacxx'} # cudax has different CTK reqs: - - {jobs: ['build'], project: 'cudax', ctk: ['12.0'], std: 20, cxx: ['msvc14.36']} - - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc10', 'gcc11', 'gcc12']} - - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['clang14', 'clang15', 'clang16', 'clang17']} + - {jobs: ['build'], project: 'cudax', ctk: ['12.0'], std: 20, cxx: ['msvc14.36']} + - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc10', 'gcc11', 'gcc12']} + - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], project: 'cudax', ctk: ['12.5'], std: 'all', cxx: ['nvhpc']} - - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['msvc2022']} - - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 17, cxx: ['gcc'], sm: "90"} - - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc'], sm: "90a"} + - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['msvc2022']} + - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 17, cxx: ['gcc'], sm: "90"} + - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc'], sm: "90a"} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'} - - {jobs: ['test'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc12', 'clang', 'msvc']} + - {jobs: ['test'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc12', 'clang', 'msvc'], gpu: 'rtx2080'} # Python and c/parallel jobs: - - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6'} + - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6', gpu: 'rtx2080'} # cccl-infra: - - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14']} - - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']} + - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14'], gpu: 'rtx2080'} + - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang'], gpu: 'rtx2080'} nightly: # Edge-case jobs - - {jobs: ['limited'], project: 'cub', std: 17} - - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} - - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit'} + - {jobs: ['limited'], project: 'cub', std: 17, gpu: 'rtx2080'} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 'rtx4090'} # Old CTK/compiler - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc7', 'gcc8', 'gcc9', 'clang14', 'msvc2019']} - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc11'], sm: 
'60;70;80;90'} @@ -70,7 +72,11 @@ workflows: - {jobs: ['build'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], std: 'all', cxx: ['msvc2019']} # Test current CTK - - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']} + - {jobs: ['test'], project: 'cub', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtxa6000'} + - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'v100'} + - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'h100', sm: 'gpu' } + - {jobs: ['test'], project: 'thrust', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx4090'} + - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'} # Modded builds: - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'} - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'} @@ -88,26 +94,9 @@ workflows: - {jobs: ['build'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['gcc12'], sm: "90"} - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13'], sm: "90a"} - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13', 'clang16'], cpu: 'arm64'} - - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']} - - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']} - - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']} - -# # These are waiting on the NVKS nodes: -# - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc7', std: [11]} -# - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang14', std: [17]} -# - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17]} -# - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14]} -# - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all'} -# - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang14', std: [11]} -# # H100 runners are currently flakey, only build since those use CPU-only runners: -# - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]} -# - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang18', std: [17]} -# -# # nvrtc: -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']} -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']} -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all', project: ['libcudacxx']} -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc13', std: [11, 20], project: ['libcudacxx']} + - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12'] , gpu: 'rtx2080'} + - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14'], gpu: 'rtx2080'} + - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18'], gpu: 'rtx2080'} # Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows. 
exclude: diff --git a/ci/windows/build_common.psm1 b/ci/windows/build_common.psm1 index 1eb5f1a9d63..151bb1f112e 100644 --- a/ci/windows/build_common.psm1 +++ b/ci/windows/build_common.psm1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -20,6 +24,12 @@ if ($script:CL_VERSION_STRING -match "Version (\d+\.\d+)\.\d+") { Write-Host "Detected cl.exe version: $CL_VERSION" } +$script:GLOBAL_CMAKE_OPTIONS = "" +if ($CUDA_ARCH -ne 0) { + $script:GLOBAL_CMAKE_OPTIONS += "-DCMAKE_CUDA_ARCHITECTURES=$CUDA_ARCH" +} + + if (-not $env:CCCL_BUILD_INFIX) { $env:CCCL_BUILD_INFIX = "" } @@ -56,6 +66,7 @@ Write-Host "NVCC_VERSION=$NVCC_VERSION" Write-Host "CMAKE_BUILD_PARALLEL_LEVEL=$env:CMAKE_BUILD_PARALLEL_LEVEL" Write-Host "CTEST_PARALLEL_LEVEL=$env:CTEST_PARALLEL_LEVEL" Write-Host "CCCL_BUILD_INFIX=$env:CCCL_BUILD_INFIX" +Write-Host "GLOBAL_CMAKE_OPTIONS=$script:GLOBAL_CMAKE_OPTIONS" Write-Host "Current commit is:" Write-Host "$(git log -1 --format=short)" Write-Host "========================================" @@ -82,7 +93,7 @@ function configure_preset { pushd ".." # Echo and execute command to stdout: - $configure_command = "cmake --preset $PRESET $CMAKE_OPTIONS --log-level VERBOSE" + $configure_command = "cmake --preset $PRESET $script:GLOBAL_CMAKE_OPTIONS $CMAKE_OPTIONS --log-level VERBOSE" Write-Host $configure_command Invoke-Expression $configure_command $test_result = $LastExitCode diff --git a/ci/windows/build_cub.ps1 b/ci/windows/build_cub.ps1 index 32e4f71ee9a..27c5360ded9 100644 --- a/ci/windows/build_cub.ps1 +++ b/ci/windows/build_cub.ps1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { pushd "$PSScriptRoot/.." 
} -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "cub-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" diff --git a/ci/windows/build_cudax.ps1 b/ci/windows/build_cudax.ps1 index ca7bd578291..7b8cd0ff771 100644 --- a/ci/windows/build_cudax.ps1 +++ b/ci/windows/build_cudax.ps1 @@ -4,7 +4,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(20)] - [int]$CXX_STANDARD = 20 + [int]$CXX_STANDARD = 20, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $CURRENT_PATH = Split-Path $pwd -leaf @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { } Remove-Module -Name build_common -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "cudax-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" diff --git a/ci/windows/build_libcudacxx.ps1 b/ci/windows/build_libcudacxx.ps1 index a57e2280de7..2f80619f76b 100644 --- a/ci/windows/build_libcudacxx.ps1 +++ b/ci/windows/build_libcudacxx.ps1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { pushd "$PSScriptRoot/.." } -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $GPU_ARCHS +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "libcudacxx-cpp${CXX_STANDARD}" $CMAKE_OPTIONS = "" diff --git a/ci/windows/build_thrust.ps1 b/ci/windows/build_thrust.ps1 index 186ed94eace..bda86859fd4 100644 --- a/ci/windows/build_thrust.ps1 +++ b/ci/windows/build_thrust.ps1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { pushd "$PSScriptRoot/.." 
} -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "thrust-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" diff --git a/ci/windows/test_thrust.ps1 b/ci/windows/test_thrust.ps1 index 7c020714208..eabda06df5b 100644 --- a/ci/windows/test_thrust.ps1 +++ b/ci/windows/test_thrust.ps1 @@ -5,6 +5,10 @@ Param( [ValidateSet(11, 14, 17, 20)] [int]$CXX_STANDARD = 17, [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0, + [Parameter(Mandatory = $false)] [Alias("cpu-only")] [switch]$CPU_ONLY = $false ) @@ -24,11 +28,11 @@ If($CURRENT_PATH -ne "ci") { } # Execute the build script: -$build_command = "$PSScriptRoot/build_thrust.ps1 -std $CXX_STANDARD" +$build_command = "$PSScriptRoot/build_thrust.ps1 -std $CXX_STANDARD -arch $CUDA_ARCH" Write-Host "Executing: $build_command" Invoke-Expression $build_command -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module -Name "$PSScriptRoot/build_common.psm1" -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "thrust-cpu-cpp$CXX_STANDARD" diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index 5a093526edd..aaab1984e21 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -370,6 +370,15 @@ foreach (test_src IN LISTS test_srcs) set(launcher 0) endif() + # FIXME: There are a few remaining device algorithm tests that have not been ported to + # use Catch2 and lid variants. Mark these as `lid_0/1` so they'll run in the appropriate + # CI configs: + string(REGEX MATCH "^device_" is_device_test "${test_name}") + _cub_is_fail_test(is_fail_test "${test_name}") + if (is_device_test AND NOT is_fail_test) + string(APPEND test_name ".lid_${launcher}") + endif() + # Only one version of this test.
cub_add_test(test_target ${test_name} "${test_src}" ${cub_target} ${launcher}) cub_configure_cuda_target(${test_target} RDC ${CUB_FORCE_RDC}) From 0c17dbd005a934ffe2f83cf0b73a6a9aa5383852 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Wed, 29 Jan 2025 17:28:19 -0800 Subject: [PATCH 4/4] Deprecate and replace `CUB_IS_INT128_ENABLED` (#3427) Co-authored-by: Bernhard Manfred Gruber --- cub/cub/detail/fast_modulo_division.cuh | 6 +++--- .../device/dispatch/dispatch_histogram.cuh | 12 +++++------ .../tuning/tuning_run_length_encode.cuh | 8 ++++---- .../device/dispatch/tuning/tuning_scan.cuh | 4 ++-- .../dispatch/tuning/tuning_scan_by_key.cuh | 20 +++++++++---------- .../dispatch/tuning/tuning_select_if.cuh | 16 +++++++-------- cub/cub/util_ptx.cuh | 2 +- cub/cub/util_type.cuh | 13 ++---------- .../catch2_test_device_for_each_in_extents.cu | 4 ++-- cub/test/catch2_test_printing.cu | 2 +- cub/test/internal/catch2_test_fast_div_mod.cu | 2 +- cub/test/test_util.h | 2 +- 12 files changed, 41 insertions(+), 50 deletions(-) diff --git a/cub/cub/detail/fast_modulo_division.cuh b/cub/cub/detail/fast_modulo_division.cuh index 4a5f2048e32..09068d87be0 100644 --- a/cub/cub/detail/fast_modulo_division.cuh +++ b/cub/cub/detail/fast_modulo_division.cuh @@ -38,7 +38,7 @@ #endif // no system header #include // implicit_prom_t -#include // CUB_IS_INT128_ENABLED +#include // _CCCL_HAS_INT128() #include // cuda::std::ceil_div #include // std::has_single_bit @@ -79,7 +79,7 @@ struct larger_unsigned_type using type = ::cuda::std::uint64_t; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct larger_unsigned_type::type> @@ -87,7 +87,7 @@ struct larger_unsigned_type using type = __uint128_t; }; -#endif // CUB_IS_INT128_ENABLED +#endif // _CCCL_HAS_INT128() template using larger_unsigned_type_t = typename larger_unsigned_type::type; diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 2ac4e160220..2c2d0a2a9ca 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -646,27 +646,27 @@ public: using IntArithmeticT = ::cuda::std::_If< // sizeof(SampleT) + sizeof(CommonT) <= sizeof(uint32_t), // uint32_t, // -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() ::cuda::std::_If< // (::cuda::std::is_same::value || // ::cuda::std::is_same::value), // CommonT, // uint64_t> // -#else // ^^^ CUB_IS_INT128_ENABLED ^^^ / vvv !CUB_IS_INT128_ENABLED vvv +#else // ^^^ _CCCL_HAS_INT128() ^^^ / vvv !_CCCL_HAS_INT128() vvv uint64_t -#endif // !CUB_IS_INT128_ENABLED +#endif // !_CCCL_HAS_INT128() >; // Alias template that excludes __[u]int128 from the integral types template using is_integral_excl_int128 = -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() ::cuda::std::_If<::cuda::std::is_same::value&& ::cuda::std::is_same::value, ::cuda::std::false_type, ::cuda::std::is_integral>; -#else // ^^^ CUB_IS_INT128_ENABLED ^^^ / vvv !CUB_IS_INT128_ENABLED vvv +#else // ^^^ _CCCL_HAS_INT128() ^^^ / vvv !_CCCL_HAS_INT128() vvv ::cuda::std::is_integral; -#endif // !CUB_IS_INT128_ENABLED +#endif // !_CCCL_HAS_INT128() union ScaleT { diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index d938209dcf2..12f07f3f366 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -156,7 +156,7 @@ struct 
sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -216,7 +216,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { @@ -349,7 +349,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -414,7 +414,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 7b076507341..165a17cae52 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -175,7 +175,7 @@ struct sm80_tuning struct sm80_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> { @@ -221,7 +221,7 @@ template struct sm90_tuning struct sm90_tuning : sm90_tuning_vals {}; template <> struct sm90_tuning : sm90_tuning_vals {}; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm90_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : sm90_tuning_vals<__int128_t, 576, 21, 860, 630> {}; template <> struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh index f8e29201eea..2bc31ef6697 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh @@ -172,7 +172,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -229,7 +229,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -286,7 +286,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -343,7 +343,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -400,7 +400,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm80_tuning { @@ -465,7 +465,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { @@ -522,7 +522,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { @@ -579,7 +579,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { @@ -636,7 +636,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { @@ -693,7 +693,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template struct sm90_tuning { diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index 10d22286068..c1b74b4ae09 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -121,7 +121,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm80_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -174,7 +174,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -227,7 +227,7 @@ 
struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm80_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { @@ -280,7 +280,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { @@ -336,7 +336,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm90_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -389,7 +389,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -442,7 +442,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm90_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { @@ -495,7 +495,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() template <> struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index 99beeed313e..e6bb45c4a31 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -99,7 +99,7 @@ BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type return (source >> bit_start) & MASK; } -# if CUB_IS_INT128_ENABLED +# if _CCCL_HAS_INT128() /** * Bitfield-extract for 128-bit types. */ diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index 4d1db99a821..a89cd159309 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -76,17 +76,8 @@ _CCCL_DIAG_POP CUB_NAMESPACE_BEGIN #ifndef CUB_IS_INT128_ENABLED -# if defined(__CUDACC_RTC__) -# if defined(__CUDACC_RTC_INT128__) -# define CUB_IS_INT128_ENABLED 1 -# endif // !defined(__CUDACC_RTC_INT128__) -# else // !defined(__CUDACC_RTC__) -# if _CCCL_CUDACC_AT_LEAST(11, 5) -# if _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC) -# define CUB_IS_INT128_ENABLED 1 -# endif // GCC || CLANG || NVHPC -# endif // _CCCL_CUDACC_AT_LEAST(11, 5) -# endif // !defined(__CUDACC_RTC__) +// Deprecated [Since 2.8] +# define CUB_IS_INT128_ENABLED _CCCL_HAS_INT128() #endif // !defined(CUB_IS_INT128_ENABLED) /****************************************************************************** diff --git a/cub/test/catch2_test_device_for_each_in_extents.cu b/cub/test/catch2_test_device_for_each_in_extents.cu index 3e5a6c6689a..313b9e58b38 100644 --- a/cub/test/catch2_test_device_for_each_in_extents.cu +++ b/cub/test/catch2_test_device_for_each_in_extents.cu @@ -107,7 +107,7 @@ using index_types = uint16_t, int32_t, uint32_t -# if CUB_IS_INT128_ENABLED +# if _CCCL_HAS_INT128() , int64_t, uint64_t @@ -120,7 +120,7 @@ using index_types_dynamic = uint16_t, int32_t, uint32_t -# if CUB_IS_INT128_ENABLED +# if _CCCL_HAS_INT128() , int64_t, uint64_t diff --git a/cub/test/catch2_test_printing.cu b/cub/test/catch2_test_printing.cu index 6f93515114a..63b622f3554 100644 --- a/cub/test/catch2_test_printing.cu +++ b/cub/test/catch2_test_printing.cu @@ -11,7 +11,7 @@ std::string print(T val) return ss.str(); } -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() TEST_CASE("Test utils can print __int128", "[test][utils]") { REQUIRE(print(__int128_t{0}) == "0"); diff --git 
a/cub/test/internal/catch2_test_fast_div_mod.cu b/cub/test/internal/catch2_test_fast_div_mod.cu index 8a1a3e96a27..ec3b5e20d68 100644 --- a/cub/test/internal/catch2_test_fast_div_mod.cu +++ b/cub/test/internal/catch2_test_fast_div_mod.cu @@ -42,7 +42,7 @@ using index_types = uint16_t, int32_t, uint32_t -# if CUB_IS_INT128_ENABLED +# if _CCCL_HAS_INT128() , int64_t, uint64_t diff --git a/cub/test/test_util.h b/cub/test/test_util.h index 031298120dc..9a5fefcc69c 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -717,7 +717,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair< return os; } -#if CUB_IS_INT128_ENABLED +#if _CCCL_HAS_INT128() inline std::ostream& operator<<(std::ostream& os, __uint128_t val) { constexpr int max_digits = 40;