From b8e1549eead0a8d33bd4259cf49be2d256e3d105 Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Tue, 28 Jan 2025 14:52:28 -0500
Subject: [PATCH 1/9] Update CI matrix to use NVKS nodes.

General allocation strategy is:

- Primary CUB testing continues to use v100 (32GiB). This is because CUB tests often require very large amounts of gmem.
- Other CUB builds use t4 (16GiB). These should have enough memory to run most tests.
- Thrust testing uses t4 (16GiB). Some tests may require >8GiB, but not as much as CUB requires.
- libcudacxx/cudax/python testing uses rtx2080 (8GiB), as these are not as memory intensive as Thrust/CUB.

None of the NVKS queues require the testing tag anymore, so this has been removed as well.
---
 ci/matrix.yaml | 85 +++++++++++++++++++++-----------------------------
 1 file changed, 36 insertions(+), 49 deletions(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 6a98e8fc5b0..c9632abf87c 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -19,49 +19,50 @@ workflows:
     - {jobs: ['build'], std: 'max', cxx: ['msvc2019']}
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']}
     # Current CTK testing:
-    - {jobs: ['test'],  project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc', 'clang']}
+    - {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['gcc', 'clang'], gpu: 't4',      sm: 'gpu'}
+    - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx2080', sm: 'gpu'}
     # Disabled until we figure out the issue with the TBB dll
-    #- {jobs: ['test'],  project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc']}
+    #- {jobs: ['test'],  project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc'], gpu: 't4', sm: 'gpu'}
     # Split up cub tests:
-    - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc']}
-    - {jobs: ['test_lid1',  'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']}
-    - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc']}
-    - {jobs: ['test_lid0'],               project: ['cub'], std: 'max', cxx: 'gcc12', gpu: 'h100',  sm: 'gpu' }
+    - {jobs: ['test_lid0'],                            project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 'v100', sm: 'gpu'}
+    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 't4',   sm: 'gpu'}
+    - {jobs: ['test_nolid', 'test_lid0'],              project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 't4',   sm: 'gpu'}
+    - {jobs: ['test_lid0'],                            project: ['cub'], std: 'max', cxx: 'gcc12',           gpu: 'h100', sm: 'gpu' }
     # Modded builds:
     - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'}
     - {jobs: ['build'], std: 'max', cxx: ['gcc', 'clang'], cpu: 'arm64'}
     - {jobs: ['build'], std: 'max', cxx: ['gcc'], sm: '90a'}
     # Test Thrust 32-bit-only dispatch here, since it's most likely to break. 64-bit-only is tested in nightly.
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'}
+    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 't4', sm: 'gpu'}
     # default_projects: clang-cuda
     - {jobs: ['build'], std: 'all', cudacxx: 'clang', cxx: 'clang'}
     - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90'}
     - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90a'}
     # nvrtc:
-    - {jobs: ['nvrtc'], project: 'libcudacxx', std: 'all'}
+    - {jobs: ['nvrtc'], project: 'libcudacxx', std: 'all', gpu: 'rtx2080', sm: 'gpu'}
     # verify-codegen:
     - {jobs: ['verify_codegen'], project: 'libcudacxx'}
     # cudax has different CTK reqs:
-    - {jobs: ['build'], project: 'cudax', ctk: ['12.0'], std: 20,       cxx: ['msvc14.36']}
-    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20,       cxx: ['gcc10', 'gcc11', 'gcc12']}
-    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20,       cxx: ['clang14', 'clang15', 'clang16', 'clang17']}
+    - {jobs: ['build'], project: 'cudax', ctk: ['12.0'], std: 20,    cxx: ['msvc14.36']}
+    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20,    cxx: ['gcc10', 'gcc11', 'gcc12']}
+    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20,    cxx: ['clang14', 'clang15', 'clang16', 'clang17']}
     - {jobs: ['build'], project: 'cudax', ctk: ['12.5'], std: 'all', cxx: ['nvhpc']}
-    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20,       cxx: ['msvc2022']}
-    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 17,       cxx: ['gcc'], sm: "90"}
-    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20,       cxx: ['gcc'], sm: "90a"}
+    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20,    cxx: ['msvc2022']}
+    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 17,    cxx: ['gcc'], sm: "90"}
+    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20,    cxx: ['gcc'], sm: "90a"}
     - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}
-    - {jobs: ['test'],  project: 'cudax', ctk: ['curr'], std: 20,       cxx: ['gcc12', 'clang', 'msvc']}
+    - {jobs: ['test'],  project: 'cudax', ctk: ['curr'], std: 20,    cxx: ['gcc12', 'clang', 'msvc'], gpu: 'rtx2080', sm: 'gpu'}
     # Python and c/parallel jobs:
-    - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6'}
+    - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6', gpu: 'rtx2080', sm: 'gpu'}
     # cccl-infra:
-    - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14']}
-    - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc',   'clang']}
+    - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14'], gpu: 'rtx2080', sm: 'gpu'}
+    - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc',   'clang'],   gpu: 'rtx2080', sm: 'gpu'}
 
   nightly:
     # Edge-case jobs
-    - {jobs: ['limited'], project: 'cub', std: 17}
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'}
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit'}
+    - {jobs: ['limited'], project: 'cub', std: 17, gpu: 'rtx2080', sm: 'gpu'}
+    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 't4', sm: 'gpu'}
+    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 't4', sm: 'gpu'}
     # Old CTK/compiler
     - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc7', 'gcc8', 'gcc9', 'clang14', 'msvc2019']}
     - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc11'], sm: '60;70;80;90'}
@@ -70,7 +71,10 @@ workflows:
     - {jobs: ['build'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17']}
     - {jobs: ['build'], std: 'all', cxx: ['msvc2019']}
     # Test current CTK
-    - {jobs: ['test'],  std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']}
+    - {jobs: ['test_lid0'],                             project: 'cub',        std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100',    sm: 'gpu'}
+    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'],  project: 'cub',        std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4',      sm: 'gpu'}
+    - {jobs: ['test'],                                  project: 'thrust',     std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4',      sm: 'gpu'}
+    - {jobs: ['test'],                                  project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'rtx2080', sm: 'gpu'}
     # Modded builds:
     - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'}
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}
@@ -88,26 +92,9 @@ workflows:
     - {jobs: ['build'], project: 'cudax', ctk: ['12.0'        ], std: 'all', cxx: ['gcc12'], sm: "90"}
     - {jobs: ['build'], project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['gcc13'], sm: "90a"}
     - {jobs: ['build'], project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['gcc13', 'clang16'], cpu: 'arm64'}
-    - {jobs: ['test'],  project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']}
-    - {jobs: ['test'],  project: 'cudax', ctk: ['12.0'        ], std: 'all', cxx: ['clang14']}
-    - {jobs: ['test'],  project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['clang18']}
-
-#  # These are waiting on the NVKS nodes:
-#    - {jobs: ['test'],  ctk: '11.1', gpu: 'v100',     sm: 'gpu', cxx: 'gcc7',    std: [11]}
-#    - {jobs: ['test'],  ctk: '11.1', gpu: 't4',       sm: 'gpu', cxx: 'clang14',  std: [17]}
-#    - {jobs: ['test'],  ctk: '11.8', gpu: 'rtx2080',  sm: 'gpu', cxx: 'gcc11',   std: [17]}
-#    - {jobs: ['test'],  ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7',    std: [14]}
-#    - {jobs: ['test'],  ctk: 'curr', gpu: 'l4',       sm: 'gpu', cxx: 'gcc13',   std: 'all'}
-#    - {jobs: ['test'],  ctk: 'curr', gpu: 'rtx4090',  sm: 'gpu', cxx: 'clang14',  std: [11]}
-#    # H100 runners are currently flakey, only build since those use CPU-only runners:
-#    - {jobs: ['build'], ctk: 'curr', gpu: 'h100',     sm: 'gpu', cxx: 'gcc12',   std: [11, 20]}
-#    - {jobs: ['build'], ctk: 'curr', gpu: 'h100',     sm: 'gpu', cxx: 'clang18', std: [17]}
-#
-#   # nvrtc:
-#    - {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4',       sm: 'gpu', cxx: 'gcc13',  std: [20],     project: ['libcudacxx']}
-#    - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc13',  std: [20],     project: ['libcudacxx']}
-#    - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4',       sm: 'gpu', cxx: 'gcc13',  std: 'all',    project: ['libcudacxx']}
-#    - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'h100',     sm: 'gpu', cxx: 'gcc13',  std: [11, 20], project: ['libcudacxx']}
+    - {jobs: ['test'],  project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']  , gpu: 'rtx2080', sm: 'gpu'}
+    - {jobs: ['test'],  project: 'cudax', ctk: ['12.0'        ], std: 'all', cxx: ['clang14'], gpu: 'rtx2080', sm: 'gpu'}
+    - {jobs: ['test'],  project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['clang18'], gpu: 'rtx2080', sm: 'gpu'}
 
   # Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows.
   exclude:
@@ -257,13 +244,13 @@ projects:
 
 # testing -> Runner with GPU is in a nv-gh-runners testing pool
 gpus:
-  v100:     { sm: 70 }                # 32 GB,  40 runners
-  t4:       { sm: 75, testing: true } # 16 GB,   8 runners
-  rtx2080:  { sm: 75, testing: true } #  8 GB,   8 runners
-  rtxa6000: { sm: 86, testing: true } # 48 GB,  12 runners
-  l4:       { sm: 89, testing: true } # 24 GB,  48 runners
-  rtx4090:  { sm: 89, testing: true } # 24 GB,  10 runners
-  h100:     { sm: 90, testing: true } # 80 GB,  16 runners
+  v100:     { sm: 70 } # 32 GB,  40 runners
+  t4:       { sm: 75 } # 16 GB,  10 runners
+  rtx2080:  { sm: 75 } #  8 GB,  12 runners
+  rtxa6000: { sm: 86 } # 48 GB,  12 runners
+  l4:       { sm: 89 } # 24 GB,  48 runners
+  rtx4090:  { sm: 89 } # 24 GB,  10 runners
+  h100:     { sm: 90 } # 80 GB,  16 runners
 
 # Tags are used to define a `matrix job` in the workflow section.
 #

From 50864b97c6315845c4eb4ae5039111da93b7b4ff Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Tue, 28 Jan 2025 16:31:23 -0500
Subject: [PATCH 2/9] Update windows CI scripts to accept -arch.

---
 ci/windows/build_common.psm1    | 15 +++++++++++++--
 ci/windows/build_cub.ps1        |  8 ++++++--
 ci/windows/build_cudax.ps1      |  8 ++++++--
 ci/windows/build_libcudacxx.ps1 |  8 ++++++--
 ci/windows/build_thrust.ps1     |  8 ++++++--
 ci/windows/test_thrust.ps1      |  8 ++++++--
 6 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/ci/windows/build_common.psm1 b/ci/windows/build_common.psm1
index 1eb5f1a9d63..151bb1f112e 100644
--- a/ci/windows/build_common.psm1
+++ b/ci/windows/build_common.psm1
@@ -3,7 +3,11 @@ Param(
     [Alias("std")]
     [ValidateNotNullOrEmpty()]
     [ValidateSet(11, 14, 17, 20)]
-    [int]$CXX_STANDARD = 17
+    [int]$CXX_STANDARD = 17,
+    [Parameter(Mandatory = $false)]
+    [ValidateNotNullOrEmpty()]
+    [Alias("arch")]
+    [int]$CUDA_ARCH = 0
 )
 
 $ErrorActionPreference = "Stop"
@@ -20,6 +24,12 @@ if ($script:CL_VERSION_STRING -match "Version (\d+\.\d+)\.\d+") {
     Write-Host "Detected cl.exe version: $CL_VERSION"
 }
 
+$script:GLOBAL_CMAKE_OPTIONS = ""
+if ($CUDA_ARCH -ne 0) {
+    $script:GLOBAL_CMAKE_OPTIONS += "-DCMAKE_CUDA_ARCHITECTURES=$CUDA_ARCH"
+}
+
+
 if (-not $env:CCCL_BUILD_INFIX) {
     $env:CCCL_BUILD_INFIX = ""
 }
@@ -56,6 +66,7 @@ Write-Host "NVCC_VERSION=$NVCC_VERSION"
 Write-Host "CMAKE_BUILD_PARALLEL_LEVEL=$env:CMAKE_BUILD_PARALLEL_LEVEL"
 Write-Host "CTEST_PARALLEL_LEVEL=$env:CTEST_PARALLEL_LEVEL"
 Write-Host "CCCL_BUILD_INFIX=$env:CCCL_BUILD_INFIX"
+Write-Host "GLOBAL_CMAKE_OPTIONS=$script:GLOBAL_CMAKE_OPTIONS"
 Write-Host "Current commit is:"
 Write-Host "$(git log -1 --format=short)"
 Write-Host "========================================"
@@ -82,7 +93,7 @@ function configure_preset {
     pushd ".."
 
     # Echo and execute command to stdout:
-    $configure_command = "cmake --preset $PRESET $CMAKE_OPTIONS --log-level VERBOSE"
+    $configure_command = "cmake --preset $PRESET $script:GLOBAL_CMAKE_OPTIONS $CMAKE_OPTIONS --log-level VERBOSE"
     Write-Host $configure_command
     Invoke-Expression $configure_command
     $test_result = $LastExitCode
diff --git a/ci/windows/build_cub.ps1 b/ci/windows/build_cub.ps1
index 32e4f71ee9a..27c5360ded9 100644
--- a/ci/windows/build_cub.ps1
+++ b/ci/windows/build_cub.ps1
@@ -3,7 +3,11 @@ Param(
     [Alias("std")]
     [ValidateNotNullOrEmpty()]
     [ValidateSet(11, 14, 17, 20)]
-    [int]$CXX_STANDARD = 17
+    [int]$CXX_STANDARD = 17,
+    [Parameter(Mandatory = $false)]
+    [ValidateNotNullOrEmpty()]
+    [Alias("arch")]
+    [int]$CUDA_ARCH = 0
 )
 
 $ErrorActionPreference = "Stop"
@@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") {
     pushd "$PSScriptRoot/.."
 }
 
-Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD
+Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH
 
 $PRESET = "cub-cpp$CXX_STANDARD"
 $CMAKE_OPTIONS = ""
diff --git a/ci/windows/build_cudax.ps1 b/ci/windows/build_cudax.ps1
index ca7bd578291..7b8cd0ff771 100644
--- a/ci/windows/build_cudax.ps1
+++ b/ci/windows/build_cudax.ps1
@@ -4,7 +4,11 @@ Param(
     [Alias("std")]
     [ValidateNotNullOrEmpty()]
     [ValidateSet(20)]
-    [int]$CXX_STANDARD = 20
+    [int]$CXX_STANDARD = 20,
+    [Parameter(Mandatory = $false)]
+    [ValidateNotNullOrEmpty()]
+    [Alias("arch")]
+    [int]$CUDA_ARCH = 0
 )
 
 $CURRENT_PATH = Split-Path $pwd -leaf
@@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") {
 }
 
 Remove-Module -Name build_common
-Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD
+Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH
 
 $PRESET = "cudax-cpp$CXX_STANDARD"
 $CMAKE_OPTIONS = ""
diff --git a/ci/windows/build_libcudacxx.ps1 b/ci/windows/build_libcudacxx.ps1
index a57e2280de7..2f80619f76b 100644
--- a/ci/windows/build_libcudacxx.ps1
+++ b/ci/windows/build_libcudacxx.ps1
@@ -3,7 +3,11 @@ Param(
     [Alias("std")]
     [ValidateNotNullOrEmpty()]
     [ValidateSet(11, 14, 17, 20)]
-    [int]$CXX_STANDARD = 17
+    [int]$CXX_STANDARD = 17,
+    [Parameter(Mandatory = $false)]
+    [ValidateNotNullOrEmpty()]
+    [Alias("arch")]
+    [int]$CUDA_ARCH = 0
 )
 
 $ErrorActionPreference = "Stop"
@@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") {
     pushd "$PSScriptRoot/.."
 }
 
-Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $GPU_ARCHS
+Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH
 
 $PRESET = "libcudacxx-cpp${CXX_STANDARD}"
 $CMAKE_OPTIONS = ""
diff --git a/ci/windows/build_thrust.ps1 b/ci/windows/build_thrust.ps1
index 186ed94eace..bda86859fd4 100644
--- a/ci/windows/build_thrust.ps1
+++ b/ci/windows/build_thrust.ps1
@@ -3,7 +3,11 @@ Param(
     [Alias("std")]
     [ValidateNotNullOrEmpty()]
     [ValidateSet(11, 14, 17, 20)]
-    [int]$CXX_STANDARD = 17
+    [int]$CXX_STANDARD = 17,
+    [Parameter(Mandatory = $false)]
+    [ValidateNotNullOrEmpty()]
+    [Alias("arch")]
+    [int]$CUDA_ARCH = 0
 )
 
 $ErrorActionPreference = "Stop"
@@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") {
     pushd "$PSScriptRoot/.."
 }
 
-Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD
+Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH
 
 $PRESET = "thrust-cpp$CXX_STANDARD"
 $CMAKE_OPTIONS = ""
diff --git a/ci/windows/test_thrust.ps1 b/ci/windows/test_thrust.ps1
index 7c020714208..eabda06df5b 100644
--- a/ci/windows/test_thrust.ps1
+++ b/ci/windows/test_thrust.ps1
@@ -5,6 +5,10 @@ Param(
     [ValidateSet(11, 14, 17, 20)]
     [int]$CXX_STANDARD = 17,
     [Parameter(Mandatory = $false)]
+    [ValidateNotNullOrEmpty()]
+    [Alias("arch")]
+    [int]$CUDA_ARCH = 0,
+    [Parameter(Mandatory = $false)]
     [Alias("cpu-only")]
     [switch]$CPU_ONLY = $false
 )
@@ -24,11 +28,11 @@ If($CURRENT_PATH -ne "ci") {
 }
 
 # Execute the build script:
-$build_command = "$PSScriptRoot/build_thrust.ps1 -std $CXX_STANDARD"
+$build_command = "$PSScriptRoot/build_thrust.ps1 -std $CXX_STANDARD -arch $CUDA_ARCH"
 Write-Host "Executing: $build_command"
 Invoke-Expression $build_command
 
-Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD
+Import-Module -Name "$PSScriptRoot/build_common.psm1" -ArgumentList $CXX_STANDARD, $CUDA_ARCH
 
 $PRESET = "thrust-cpu-cpp$CXX_STANDARD"
 

From 571e72997fb003f168bd1c73cfa9e1e31b14892b Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Tue, 28 Jan 2025 19:19:49 -0500
Subject: [PATCH 3/9] Move all non-Catch2 device algo tests to lid0/lid1.

This makes sure that they run in the correct CI config on appropriate hardware.
---
 cub/test/CMakeLists.txt | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt
index 5a093526edd..aaab1984e21 100644
--- a/cub/test/CMakeLists.txt
+++ b/cub/test/CMakeLists.txt
@@ -370,6 +370,15 @@ foreach (test_src IN LISTS test_srcs)
         set(launcher 0)
       endif()
 
+      # FIXME: A few device algorithm tests have not yet been ported to Catch2 and
+      # its lid variants. Append `.lid_<launcher>` to their names so they run in the
+      # appropriate lid_0/1 CI configs. Fail tests keep their original name:
+      string(REGEX MATCH "^device_" is_device_test "${test_name}")
+      _cub_is_fail_test(is_fail_test "${test_name}")
+      if (is_device_test AND NOT is_fail_test)
+        string(APPEND test_name ".lid_${launcher}")
+      endif()
+
       # Only one version of this test.
       cub_add_test(test_target ${test_name} "${test_src}" ${cub_target} ${launcher})
       cub_configure_cuda_target(${test_target} RDC ${CUB_FORCE_RDC})

From 6434ef536dcf2ffedacb1b595602eb34d4dbb493 Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Tue, 28 Jan 2025 19:24:18 -0500
Subject: [PATCH 4/9] Move libcudacxx builds to t4 temporarily.

heterogeneous/barrier_abi_v2.pass.cpp is timing out on rtx2080.
---
 ci/matrix.yaml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index c9632abf87c..1b8d206dcf7 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -20,7 +20,9 @@ workflows:
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']}
     # Current CTK testing:
     - {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['gcc', 'clang'], gpu: 't4',      sm: 'gpu'}
-    - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx2080', sm: 'gpu'}
+    # Switching to t4 temporarily while investigating a bug in heterogeneous/barrier_abi_v2.pass.cpp
+    # - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx2080', sm: 'gpu'}
+    - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 't4',      sm: 'gpu'}
     # Disabled until we figure out the issue with the TBB dll
     #- {jobs: ['test'],  project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc'], gpu: 't4', sm: 'gpu'}
     # Split up cub tests:
@@ -74,7 +76,9 @@ workflows:
     - {jobs: ['test_lid0'],                             project: 'cub',        std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100',    sm: 'gpu'}
     - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'],  project: 'cub',        std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4',      sm: 'gpu'}
     - {jobs: ['test'],                                  project: 'thrust',     std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4',      sm: 'gpu'}
-    - {jobs: ['test'],                                  project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'rtx2080', sm: 'gpu'}
+    # Switching to t4 temporarily while investigating a bug in heterogeneous/barrier_abi_v2.pass.cpp
+    # - {jobs: ['test'],                                  project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'rtx2080', sm: 'gpu'}
+    - {jobs: ['test'],                                  project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4',      sm: 'gpu'}
     # Modded builds:
     - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'}
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}

From bf68a88934338c6c94985d2baa87a64e1f4be0c3 Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Wed, 29 Jan 2025 10:58:31 -0500
Subject: [PATCH 5/9] libcudacxx tests fail on t4, too. Moving back to v100.

---
 ci/matrix.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 1b8d206dcf7..19a9dcfb27c 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -20,9 +20,9 @@ workflows:
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']}
     # Current CTK testing:
     - {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['gcc', 'clang'], gpu: 't4',      sm: 'gpu'}
-    # Switching to t4 temporarily while investigating a bug in heterogeneous/barrier_abi_v2.pass.cpp
+    # Switching to v100 temporarily while investigating a timeout in heterogeneous/barrier*.pass.cpp
     # - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx2080', sm: 'gpu'}
-    - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 't4',      sm: 'gpu'}
+    - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'v100',    sm: 'gpu'}
     # Disabled until we figure out the issue with the TBB dll
     #- {jobs: ['test'],  project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc'], gpu: 't4', sm: 'gpu'}
     # Split up cub tests:
@@ -76,9 +76,9 @@ workflows:
     - {jobs: ['test_lid0'],                             project: 'cub',        std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100',    sm: 'gpu'}
     - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'],  project: 'cub',        std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4',      sm: 'gpu'}
     - {jobs: ['test'],                                  project: 'thrust',     std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4',      sm: 'gpu'}
-    # Switching to t4 temporarily while investigating a bug in heterogeneous/barrier_abi_v2.pass.cpp
+    # Switching to v100 temporarily while investigating a timeout in heterogeneous/barrier*.pass.cpp
     # - {jobs: ['test'],                                  project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'rtx2080', sm: 'gpu'}
-    - {jobs: ['test'],                                  project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4',      sm: 'gpu'}
+    - {jobs: ['test'],                                  project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100',    sm: 'gpu'}
     # Modded builds:
     - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'}
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}

From 2977b2e11cc2e552bcf3e9bea5a08ecd4b91e4ab Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Wed, 29 Jan 2025 12:03:39 -0500
Subject: [PATCH 6/9] TEMP override matrix to experiment with libcu++ failures.

---
 ci/matrix.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 19a9dcfb27c..c4484d3d58a 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -8,6 +8,12 @@ workflows:
   #   - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']}
   #
   override:
+    - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18'], gpu: 'rtx2080', sm: 'gpu'}
+    - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18'], gpu: 't4',      sm: 'gpu'}
+    - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18'], gpu: 'v100',    sm: 'gpu'}
+    - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18'], gpu: 't4'}
+    - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18'], gpu: 'v100'}
 
   pull_request:
     # Old CTK/compiler

From 9f70b22fb4c9dde3cbfee5a8ffd8de07909f79c4 Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Wed, 29 Jan 2025 12:54:02 -0500
Subject: [PATCH 7/9] Remove `sm: 'gpu'` from most CI jobs.

This will allow us to reuse more build artifacts, and works around some issues with libcudacxx (#3590).
---
 ci/matrix.yaml | 49 +++++++++++++++++++++----------------------------
 1 file changed, 21 insertions(+), 28 deletions(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index c4484d3d58a..800f1b900ec 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -8,12 +8,6 @@ workflows:
   #   - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']}
   #
   override:
-    - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18'], gpu: 'rtx2080', sm: 'gpu'}
-    - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18'], gpu: 't4',      sm: 'gpu'}
-    - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18'], gpu: 'v100',    sm: 'gpu'}
-    - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18'], gpu: 'rtx2080'}
-    - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18'], gpu: 't4'}
-    - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18'], gpu: 'v100'}
 
   pull_request:
     # Old CTK/compiler
@@ -25,23 +19,21 @@ workflows:
     - {jobs: ['build'], std: 'max', cxx: ['msvc2019']}
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']}
     # Current CTK testing:
-    - {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['gcc', 'clang'], gpu: 't4',      sm: 'gpu'}
-    # Switching to v100 temporarily while investigating a timeout in heterogeneous/barrier*.pass.cpp
-    # - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx2080', sm: 'gpu'}
-    - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'v100',    sm: 'gpu'}
+    - {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['gcc', 'clang'], gpu: 't4'}
+    - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx2080'}
     # Disabled until we figure out the issue with the TBB dll
     #- {jobs: ['test'],  project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc'], gpu: 't4', sm: 'gpu'}
     # Split up cub tests:
-    - {jobs: ['test_lid0'],                            project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 'v100', sm: 'gpu'}
-    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 't4',   sm: 'gpu'}
-    - {jobs: ['test_nolid', 'test_lid0'],              project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 't4',   sm: 'gpu'}
+    - {jobs: ['test_lid0'],                            project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 'v100'}
+    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 't4'}
+    - {jobs: ['test_nolid', 'test_lid0'],              project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 't4'}
     - {jobs: ['test_lid0'],                            project: ['cub'], std: 'max', cxx: 'gcc12',           gpu: 'h100', sm: 'gpu' }
     # Modded builds:
     - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'}
     - {jobs: ['build'], std: 'max', cxx: ['gcc', 'clang'], cpu: 'arm64'}
     - {jobs: ['build'], std: 'max', cxx: ['gcc'], sm: '90a'}
     # Test Thrust 32-bit-only dispatch here, since it's most likely to break. 64-bit-only is tested in nightly.
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 't4', sm: 'gpu'}
+    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 't4'}
     # default_projects: clang-cuda
     - {jobs: ['build'], std: 'all', cudacxx: 'clang', cxx: 'clang'}
     - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90'}
@@ -59,18 +51,18 @@ workflows:
     - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 17,    cxx: ['gcc'], sm: "90"}
     - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20,    cxx: ['gcc'], sm: "90a"}
     - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}
-    - {jobs: ['test'],  project: 'cudax', ctk: ['curr'], std: 20,    cxx: ['gcc12', 'clang', 'msvc'], gpu: 'rtx2080', sm: 'gpu'}
+    - {jobs: ['test'],  project: 'cudax', ctk: ['curr'], std: 20,    cxx: ['gcc12', 'clang', 'msvc'], gpu: 'rtx2080'}
     # Python and c/parallel jobs:
-    - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6', gpu: 'rtx2080', sm: 'gpu'}
+    - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6', gpu: 'rtx2080'}
     # cccl-infra:
-    - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14'], gpu: 'rtx2080', sm: 'gpu'}
-    - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc',   'clang'],   gpu: 'rtx2080', sm: 'gpu'}
+    - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14'], gpu: 'rtx2080'}
+    - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc',   'clang'],   gpu: 'rtx2080'}
 
   nightly:
     # Edge-case jobs
-    - {jobs: ['limited'], project: 'cub', std: 17, gpu: 'rtx2080', sm: 'gpu'}
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 't4', sm: 'gpu'}
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 't4', sm: 'gpu'}
+    - {jobs: ['limited'], project: 'cub', std: 17, gpu: 'rtx2080'}
+    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 't4'}
+    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 't4'}
     # Old CTK/compiler
     - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc7', 'gcc8', 'gcc9', 'clang14', 'msvc2019']}
     - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc11'], sm: '60;70;80;90'}
@@ -79,12 +71,13 @@ workflows:
     - {jobs: ['build'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17']}
     - {jobs: ['build'], std: 'all', cxx: ['msvc2019']}
     # Test current CTK
-    - {jobs: ['test_lid0'],                             project: 'cub',        std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100',    sm: 'gpu'}
-    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'],  project: 'cub',        std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4',      sm: 'gpu'}
-    - {jobs: ['test'],                                  project: 'thrust',     std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4',      sm: 'gpu'}
+    - {jobs: ['test_lid0'],                             project: 'cub',    std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100'}
+    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'],  project: 'cub',    std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4'}
+    - {jobs: ['test_lid0'],                             project: 'cub',    std: 'max', cxx: 'gcc12',                          gpu: 'h100', sm: 'gpu' }
+    - {jobs: ['test'],                                  project: 'thrust', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4'}
     # Switching to v100 temporarily while investigating a timeout in heterogeneous/barrier*.pass.cpp
     # - {jobs: ['test'],                                  project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'rtx2080', sm: 'gpu'}
-    - {jobs: ['test'],                                  project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100',    sm: 'gpu'}
+    - {jobs: ['test'],                                  project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100'}
     # Modded builds:
     - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'}
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}
@@ -102,9 +95,9 @@ workflows:
     - {jobs: ['build'], project: 'cudax', ctk: ['12.0'        ], std: 'all', cxx: ['gcc12'], sm: "90"}
     - {jobs: ['build'], project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['gcc13'], sm: "90a"}
     - {jobs: ['build'], project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['gcc13', 'clang16'], cpu: 'arm64'}
-    - {jobs: ['test'],  project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']  , gpu: 'rtx2080', sm: 'gpu'}
-    - {jobs: ['test'],  project: 'cudax', ctk: ['12.0'        ], std: 'all', cxx: ['clang14'], gpu: 'rtx2080', sm: 'gpu'}
-    - {jobs: ['test'],  project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['clang18'], gpu: 'rtx2080', sm: 'gpu'}
+    - {jobs: ['test'],  project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']  , gpu: 'rtx2080'}
+    - {jobs: ['test'],  project: 'cudax', ctk: ['12.0'        ], std: 'all', cxx: ['clang14'], gpu: 'rtx2080'}
+    - {jobs: ['test'],  project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['clang18'], gpu: 'rtx2080'}
 
   # Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows.
   exclude:

From ee229964288cefaf5b8269cf08c0e1c97e2e07fd Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Wed, 29 Jan 2025 14:32:26 -0500
Subject: [PATCH 8/9] Move t4 jobs to v100 while runners team investigates
 issues.

---
 ci/matrix.yaml | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 800f1b900ec..77bb9785649 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -19,21 +19,29 @@ workflows:
     - {jobs: ['build'], std: 'max', cxx: ['msvc2019']}
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']}
     # Current CTK testing:
-    - {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['gcc', 'clang'], gpu: 't4'}
+ # Moving this job to v100 while runners team investigates issues on t4 pool.
+ #   - {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['gcc', 'clang'], gpu: 't4'}
+    - {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['gcc', 'clang'], gpu: 'v100'}
     - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx2080'}
     # Disabled until we figure out the issue with the TBB dll
-    #- {jobs: ['test'],  project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc'], gpu: 't4', sm: 'gpu'}
+   #- {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['msvc'], gpu: 't4'}
+    - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['msvc'], gpu: 'rtx2080'}
     # Split up cub tests:
     - {jobs: ['test_lid0'],                            project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 'v100'}
-    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 't4'}
-    - {jobs: ['test_nolid', 'test_lid0'],              project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 't4'}
+# Moving these jobs to v100 while runners team investigates issues on t4 pool.
+#    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 't4'}
+#    - {jobs: ['test_nolid', 'test_lid0'],              project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 't4'}
+    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 'v100'}
+    - {jobs: ['test_nolid', 'test_lid0'],              project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 'v100'}
     - {jobs: ['test_lid0'],                            project: ['cub'], std: 'max', cxx: 'gcc12',           gpu: 'h100', sm: 'gpu' }
     # Modded builds:
     - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'}
     - {jobs: ['build'], std: 'max', cxx: ['gcc', 'clang'], cpu: 'arm64'}
     - {jobs: ['build'], std: 'max', cxx: ['gcc'], sm: '90a'}
     # Test Thrust 32-bit-only dispatch here, since it's most likely to break. 64-bit-only is tested in nightly.
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 't4'}
+# Moving this job to v100 while runners team investigates issues on t4 pool.
+#    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 't4'}
+    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'v100'}
     # default_projects: clang-cuda
     - {jobs: ['build'], std: 'all', cudacxx: 'clang', cxx: 'clang'}
     - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90'}
@@ -61,8 +69,11 @@ workflows:
   nightly:
     # Edge-case jobs
     - {jobs: ['limited'], project: 'cub', std: 17, gpu: 'rtx2080'}
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 't4'}
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 't4'}
+ # Moving these jobs to v100 while runners team investigates issues on t4 pool.
+    # - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 't4'}
+    # - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 't4'}
+    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'v100'}
+    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 'v100'}
     # Old CTK/compiler
     - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc7', 'gcc8', 'gcc9', 'clang14', 'msvc2019']}
     - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc11'], sm: '60;70;80;90'}
@@ -72,12 +83,14 @@ workflows:
     - {jobs: ['build'], std: 'all', cxx: ['msvc2019']}
     # Test current CTK
     - {jobs: ['test_lid0'],                             project: 'cub',    std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100'}
-    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'],  project: 'cub',    std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4'}
+# Moving this job to v100 while runners team investigates issues on t4 pool.
+#    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'],  project: 'cub',    std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4'}
+    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'],  project: 'cub',    std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100'}
     - {jobs: ['test_lid0'],                             project: 'cub',    std: 'max', cxx: 'gcc12',                          gpu: 'h100', sm: 'gpu' }
-    - {jobs: ['test'],                                  project: 'thrust', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4'}
-    # Switching to v100 temporarily while investigating a timeout in heterogeneous/barrier*.pass.cpp
-    # - {jobs: ['test'],                                  project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'rtx2080', sm: 'gpu'}
-    - {jobs: ['test'],                                  project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100'}
+# Moving this job to v100 while runners team investigates issues on t4 pool.
+#    - {jobs: ['test'],                                  project: 'thrust', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4'}
+    - {jobs: ['test'],                                  project: 'thrust', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100'}
+    - {jobs: ['test'],                                  project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'rtx2080'}
     # Modded builds:
     - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'}
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}

From ea4d40e93a8c5904ac3f5b433182c9386c15a3b5 Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Wed, 29 Jan 2025 15:11:40 -0500
Subject: [PATCH 9/9] Switch to all rtx queues:

CUB -> RTXA6000 (48GiB)
Thrust -> RTX4090 (24GiB)
Others -> RTX2080 (8GiB)
---
 ci/matrix.yaml | 42 ++++++++++++++----------------------------
 1 file changed, 14 insertions(+), 28 deletions(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 77bb9785649..5ec715fb59b 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -19,29 +19,22 @@ workflows:
     - {jobs: ['build'], std: 'max', cxx: ['msvc2019']}
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']}
     # Current CTK testing:
- # Moving this job to v100 while runners team investigates issues on t4 pool.
- #   - {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['gcc', 'clang'], gpu: 't4'}
-    - {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['gcc', 'clang'], gpu: 'v100'}
+    - {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx4090'}
     - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx2080'}
     # Disabled until we figure out the issue with the TBB dll
-   #- {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['msvc'], gpu: 't4'}
+    #- {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['msvc'], gpu: 'rtx4090'}
     - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['msvc'], gpu: 'rtx2080'}
     # Split up cub tests:
-    - {jobs: ['test_lid0'],                            project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 'v100'}
-# Moving these jobs to v100 while runners team investigates issues on t4 pool.
-#    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 't4'}
-#    - {jobs: ['test_nolid', 'test_lid0'],              project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 't4'}
-    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 'v100'}
-    - {jobs: ['test_nolid', 'test_lid0'],              project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 'v100'}
-    - {jobs: ['test_lid0'],                            project: ['cub'], std: 'max', cxx: 'gcc12',           gpu: 'h100', sm: 'gpu' }
+    - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 'rtxa6000'}
+    - {jobs: ['test_lid1', 'test_lid2'],  project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 'rtxa6000'}
+    - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 'rtxa6000'}
+    - {jobs: ['test_lid0'],               project: ['cub'], std: 'max', cxx: 'gcc12',           gpu: 'h100', sm: 'gpu' }
     # Modded builds:
     - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'}
     - {jobs: ['build'], std: 'max', cxx: ['gcc', 'clang'], cpu: 'arm64'}
     - {jobs: ['build'], std: 'max', cxx: ['gcc'], sm: '90a'}
     # Test Thrust 32-bit-only dispatch here, since it's most likely to break. 64-bit-only is tested in nightly.
-# Moving this job to v100 while runners team investigates issues on t4 pool.
-#    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 't4'}
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'v100'}
+    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'}
     # default_projects: clang-cuda
     - {jobs: ['build'], std: 'all', cudacxx: 'clang', cxx: 'clang'}
     - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90'}
@@ -69,11 +62,8 @@ workflows:
   nightly:
     # Edge-case jobs
     - {jobs: ['limited'], project: 'cub', std: 17, gpu: 'rtx2080'}
- # Moving these jobs to v100 while runners team investigates issues on t4 pool.
-    # - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 't4'}
-    # - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 't4'}
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'v100'}
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 'v100'}
+    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'}
+    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 'rtx4090'}
     # Old CTK/compiler
     - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc7', 'gcc8', 'gcc9', 'clang14', 'msvc2019']}
     - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc11'], sm: '60;70;80;90'}
@@ -82,15 +72,11 @@ workflows:
     - {jobs: ['build'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17']}
     - {jobs: ['build'], std: 'all', cxx: ['msvc2019']}
     # Test current CTK
-    - {jobs: ['test_lid0'],                             project: 'cub',    std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100'}
-# Moving this job to v100 while runners team investigates issues on t4 pool.
-#    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'],  project: 'cub',    std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4'}
-    - {jobs: ['test_nolid', 'test_lid1', 'test_lid2'],  project: 'cub',    std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100'}
-    - {jobs: ['test_lid0'],                             project: 'cub',    std: 'max', cxx: 'gcc12',                          gpu: 'h100', sm: 'gpu' }
-# Moving this job to v100 while runners team investigates issues on t4 pool.
-#    - {jobs: ['test'],                                  project: 'thrust', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 't4'}
-    - {jobs: ['test'],                                  project: 'thrust', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'v100'}
-    - {jobs: ['test'],                                  project: 'libcudacxx', std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022'], gpu: 'rtx2080'}
+    - {jobs: ['test'],      project: 'cub',        std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtxa6000'}
+    - {jobs: ['test_lid0'], project: 'cub',        std: 'max', cxx: 'gcc',                    gpu: 'v100'}
+    - {jobs: ['test_lid0'], project: 'cub',        std: 'max', cxx: 'gcc',                    gpu: 'h100', sm: 'gpu' }
+    - {jobs: ['test'],      project: 'thrust',     std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx4090'}
+    - {jobs: ['test'],      project: 'libcudacxx', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
     # Modded builds:
     - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'}
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}