diff --git a/.github/actions/compute-matrix/compute-matrix.sh b/.github/actions/compute-matrix/compute-matrix.sh
index 1629836d216..c1425f2ab1a 100755
--- a/.github/actions/compute-matrix/compute-matrix.sh
+++ b/.github/actions/compute-matrix/compute-matrix.sh
@@ -13,7 +13,7 @@ explode_std_versions() {
 }
 
 explode_libs() {
-  jq -cr 'map(. as $o | {lib: $o.lib[]} + del($o.lib))'
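+  # e.g. {project: ["cub", "thrust"], ctk: "12.4"} is exploded into
+  #      {project: "cub", ctk: "12.4"} and {project: "thrust", ctk: "12.4"}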
+  jq -cr 'map(. as $o | {project: $o.project[]} + del($o.project))'
 }
 
 # Filter out the libraries that are dirty
@@ -34,7 +34,10 @@ filter_libs() {
   # echo "Dirty libraries: ${dirty_libs[@]}" >> /dev/stderr
 
   # Construct a regex to filter out the dirty libraries
-  dirty_lib_regex=$(IFS="|"; echo "${dirty_libs[*]}")
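+  # e.g. dirty_libs=(cub thrust) joins to "cub|thrust", anchored below as ^(cub|thrust)$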
+  dirty_lib_regex=$(
+    IFS="|"
+    echo "${dirty_libs[*]}"
+  )
   dirty_lib_regex="^(${dirty_lib_regex})\$"
   jq_filter="map(select(.lib | test(\"$dirty_lib_regex\")))"
   jq -cr "$jq_filter"
@@ -44,39 +47,89 @@ extract_matrix() {
   local file="$1"
   local type="$2"
   local matrix=$(yq -o=json "$file" | jq -cr ".$type")
-  write_output "DEVCONTAINER_VERSION" "$(yq -o json "$file" | jq -cr '.devcontainer_version')"
+  local devcontainer_version=$(yq -o json "$file" | jq -cr '.devcontainer_version')
 
-  local nvcc_full_matrix="$(echo "$matrix" | jq -cr '.nvcc' | explode_std_versions )"
-  local per_cuda_compiler_matrix="$(echo "$nvcc_full_matrix" | jq -cr ' group_by(.cuda + .compiler.name) | map({(.[0].cuda + "-" + .[0].compiler.name): .}) | add')"
-  write_output "PER_CUDA_COMPILER_MATRIX"  "$per_cuda_compiler_matrix"
-  write_output "PER_CUDA_COMPILER_KEYS" "$(echo "$per_cuda_compiler_matrix" | jq -r 'keys | @json')"
+  write_output "DEVCONTAINER_VERSION" "$devcontainer_version"
 
-  write_output "NVRTC_MATRIX" "$(echo "$matrix" | jq '.nvrtc' | explode_std_versions)"
+  local full_matrix="$(echo "$matrix" | explode_std_versions | explode_libs)" # | filter_libs)"
 
-  local clang_cuda_matrix="$(echo "$matrix" | jq -cr '.["clang-cuda"]' | explode_std_versions | explode_libs | filter_libs)"
-  write_output "CLANG_CUDA_MATRIX" "$clang_cuda_matrix"
-  write_output "CCCL_INFRA_MATRIX" "$(echo "$matrix" | jq -cr '.["cccl-infra"]' )"
+  create_dispatch_job() {
+    local input_object="$1"
+    local output_object=$(.github/actions/compute-matrix/generate-ci-dispatch-job.sh "$devcontainer_version" "$input_object")
+    echo "$output_object"
+  }
+
+  # For each json object in the matrix array, accumulate the output json object from `generate-ci-dispatch-job.sh`
+  # into a single json object. If duplicate keys are found, merge the matching value arrays.
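+  #
+  # Illustrative example: two dispatch jobs that share the top-level group key
+  #   {"CUB CUDA12.4 nvcc-12.4": {"standalone": [A], "two_stage": []}}
+  #   {"CUB CUDA12.4 nvcc-12.4": {"standalone": [],  "two_stage": [B]}}
+  # are merged into
+  #   {"CUB CUDA12.4 nvcc-12.4": {"standalone": [A], "two_stage": [B]}}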
+  local wf_json="{}"
+  local input_objects=$(echo "$full_matrix" | jq -cr '.[]')
+  for input_object in $input_objects; do
+    local dispatch_job=$(create_dispatch_job "$input_object")
+    wf_json=$(
+      jq --argjson accum "$wf_json" --argjson new "$dispatch_job" '
+        $new | . as $input |
+        reduce keys[] as $key (
+            $accum;
+            if .[$key] then
+                .[$key] |= (
+                    reduce ($input[$key] | keys_unsorted[]) as $nestedKey (.;
+                        .[$nestedKey] += $input[$key][$nestedKey]
+                    )
+                )
+            else
+                .[$key] = $input[$key]
+            end
+        )
+    ' <<<'{}')
+  done
+
+  wf_keys=$(echo "$wf_json" | jq -r 'keys | @json')
+
+  echo "wf_json:" >&2
+  echo "$wf_json" | jq '.' >&2
+
+  write_output "WORKFLOW" "$(echo "$wf_json" | jq -c '.')"
+  write_output "WORKFLOW_KEYS" "$wf_keys"
 }
 
+# local per_cuda_compiler_matrix="$(echo "$nvcc_full_matrix" | jq -cr ' group_by(.cuda + .compiler.name) | map({(.[0].cuda + "-" + .[0].compiler.name): .}) | add')"
+# write_output "PER_CUDA_COMPILER_MATRIX"  "$per_cuda_compiler_matrix"
+# write_output "PER_CUDA_COMPILER_KEYS" "$(echo "$per_cuda_compiler_matrix" | jq -r 'keys | @json')"
+
+# write_output "NVRTC_MATRIX" "$(echo "$matrix" | jq '.nvrtc' | explode_std_versions)"
+
+# local clang_cuda_matrix="$(echo "$matrix" | jq -cr '.["clang-cuda"]' | explode_std_versions | explode_libs | filter_libs)"
+# write_output "CLANG_CUDA_MATRIX" "$clang_cuda_matrix"
+# write_output "CCCL_INFRA_MATRIX" "$(echo "$matrix" | jq -cr '.["cccl-infra"]' )"
+# }
+
 main() {
   if [ "$1" == "-v" ]; then
     set -x
     shift
   fi
 
-  if [ $# -ne 2 ] || [ "$2" != "pull_request" ]; then
+  if [ $# -ne 2 ]; then
     echo "Usage: $0 [-v] MATRIX_FILE MATRIX_TYPE"
     echo "  -v            : Enable verbose output"
     echo "  MATRIX_FILE   : The path to the matrix file."
-    echo "  MATRIX_TYPE   : The desired matrix. Supported values: 'pull_request'"
+    echo "  MATRIX_TYPE   : The desired matrix."
+    exit 1
+  fi
+
+  local matrix_file="$1"
+  local matrix_type="$2"
+
+  if [[ ! -f "$matrix_file" ]]; then
+    echo "Error: The matrix file '$matrix_file' does not exist."
     exit 1
   fi
 
   echo "Input matrix file:" >&2
-  cat "$1" >&2
-  echo "Matrix Type: $2" >&2
+  cat "$matrix_file" >&2
+  echo "Matrix Type: $matrix_type" >&2
 
-  extract_matrix "$1" "$2"
+  extract_matrix "$matrix_file" "$matrix_type"
 }
 
 main "$@"
diff --git a/.github/actions/compute-matrix/generate-ci-dispatch-job.sh b/.github/actions/compute-matrix/generate-ci-dispatch-job.sh
new file mode 100755
index 00000000000..60ba4f1a7d3
--- /dev/null
+++ b/.github/actions/compute-matrix/generate-ci-dispatch-job.sh
@@ -0,0 +1,342 @@
+#!/bin/bash
+
+set -euo pipefail
+
+usage() {
+  echo "Usage: $0 <devcontainer_version> '<job_json>'"
+  echo
+  echo "Generates a json object that is dispatchable using the ci-dispatch-job workflow."
+  echo
+  echo "Options:"
+  echo "  devcontainer_version: The version of the rapidsai devcontainer to use"
+  echo "  job_json: JSON string containing the job parameters"
+  echo
+  echo "Example:"
+  echo "  $0 <devcontainer_version> '{"
+  echo "    \"project\": \"cub\","
+  echo "    \"ctk\": \"11.8\","
+  echo "    \"device_compiler\": \"nvcc\","
+  echo "    \"host_compiler\": { \"name\": \"gcc\", \"version\": \"11\", \"exe\": \"gcc-11\" },"
+  echo "    \"job_types\": [\"build\", \"test\"],"
+  echo "    \"std\": 17,"
+  echo "    \"cuda_compile_arch\": \"60;70;80\","
+  echo "    \"cmake_options\": { \"-DCMAKE_BUILD_TYPE\": \"Debug\" },"
+  echo "    \"cpu\": \"amd64\","
+  echo "    \"gpu\": \"v100\","
+  echo "    \"os\": \"ubuntu22.04\","
+  echo "  }'"
+}
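+
+# The emitted object is keyed by a top-level group name; an illustrative (assumed) shape:
+#   { "CUB CUDA12.4 nvcc-12.4": { "standalone": [ {name, runner, image, command}, ... ],
+#                                 "two_stage":  [ {producers: [...], consumers: [...]}, ... ] } }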
+
+readonly devcontainer_version="$1"
+readonly input_json=$(echo "$2" | jq -c '.')
+
+# echo "devcontainer_version: ${devcontainer_version}"
+# echo "Input JSON:"
+# echo ${input_json} | jq '.'
+# echo
+
+validate_input_json() {
+  if [[ -z "${input_json}" ]]; then
+    echo "Error: input_json is empty"
+    exit 1
+  fi
+
+  required_keys=("project" "ctk" "host_compiler" "job_types" "std" "os")
+
+  for key in "${required_keys[@]}"; do
+    if [[ "$(echo "$input_json" | jq -r ".$key")" == "null" ]]; then
+      echo "Error: ${key} is required"
+      exit 1
+    fi
+  done
+}
+validate_input_json
+
+declare -A project_names
+project_names["thrust"]="Thrust"
+project_names["cub"]="CUB"
+project_names["libcudacxx"]="libcudacxx"
+
+readonly project=$(echo "$input_json" | jq -r '.project')
+readonly project_name="${project_names[$project]}"
+readonly ctk=$(echo "$input_json" | jq -r '.ctk')
+readonly device_compiler_json=$(echo "$input_json" | jq -r '.device_compiler // "nvcc"')
+readonly host_compiler=$(echo "$input_json" | jq -r '.host_compiler')
+readonly job_types=($(echo "$input_json" | jq '.job_types[]' | xargs))
+
+readonly host_compiler_name=$(echo "$host_compiler" | jq -r '.name')
+readonly host_compiler_version=$(echo "$host_compiler" | jq -r '.version')
+readonly host_compiler_exe=$(echo "$host_compiler" | jq -r '.exe')
+
+# Shortcuts for CTK-packaged device compilers:
+if [[ "${device_compiler_json}" == "nvcc" || "${device_compiler_json}" == "nvrtc" ]]; then
+  readonly device_compiler_name=${device_compiler_json}
+  readonly device_compiler_version=${ctk}
+  readonly device_compiler_exe=${device_compiler_json}
+else
+  readonly device_compiler_name=$(echo "$device_compiler_json" | jq -r '.name')
+  readonly device_compiler_version=$(echo "$device_compiler_json" | jq -r '.version')
+  readonly device_compiler_exe=$(echo "$device_compiler_json" | jq -r '.exe')
+fi
+
+readonly std=$(echo "$input_json" | jq -r '.std')
+readonly cuda_compile_arch=$(echo "$input_json" | jq -r '.cuda_compile_arch // ""')
+readonly cmake_options=$(echo "$input_json" | jq -r '.cmake_options // ""')
+readonly cpu=$(echo "$input_json" | jq -r '.cpu // "amd64"')
+readonly gpu=$(echo "$input_json" | jq -r '.gpu // "v100"')
+readonly os=$(echo "$input_json" | jq -r '.os')
+
+# There are two types of job:
+#
+# - standalone_job
+# - two_stage_job
+#
+# Standalone jobs invoke the run_as_coder workflow directly.
+#
+# Two-stage jobs invoke the two_stage workflow and specify M producers and N consumers,
+# usually a single build step followed by one or more test steps that use the build artifacts.
+# Producers and consumers each invoke the run_as_coder workflow, and the groups execute with
+# bulk synchronous parallelism -- each producer must finish before any consumer can start.
+#
+# Identify the supported jobs:
+#
+# - build_and_test (two_stage): A testing job_type is specified.
+# - build_only (standalone): The "build" job_type is specified with no testing job types.
+# - nvrtc_only (standalone): The "nvrtc" job_type is specified.
+#
+# Currently the only test job supported is "test", but this can be extended to support other jobs
+# like compute-sanitizer, etc.
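+#
+# Illustrative example: job_types: ['build', 'test'] yields a single two_stage entry
+# ({producers: [<build job>], consumers: [<test job>]}), while job_types: ['nvrtc']
+# yields a single standalone entry.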
+
+# Job types that require a build step:
+test_job_types=("test")
+
+# Job types that require a GPU runner:
+gpu_job_types=("test", "nvrtc")
+
+readonly top_level_group="${project_name} CUDA${ctk} ${device_compiler_name}-${device_compiler_version}"
+
+has_test_jobs() {
+  for test_job_type in "${test_job_types[@]}"; do
+    # If test_job_type appears in job_types
+    if [[ "${job_types[*]}" == *"${test_job_type}"* ]]; then
+      return 0
+    fi
+  done
+  return 1
+}
+
+is_windows() {
+  if [[ ${host_compiler_name} == "cl" ]]; then
+    return 0
+  fi
+  return 1
+}
+
+format_job_type() {
+  local job_type="$1"
+
+  # Special cases:
+  if [[ "${job_type}" == "nvrtc" ]]; then
+    echo "NVRTC"
+  else
+    # Otherwise capitalize the first letter:
+    echo "${job_type^}"
+  fi
+}
+
+# Whether the gpu runner comes from the testing pool (currently any GPU other than v100)
+is_testing_pool() {
+  if [[ ${gpu} == "v100" ]]; then
+    return 1
+  fi
+  return 0
+}
+
+job_type_requires_gpu() {
+  local job_type="$1"
+  if [[ "${gpu_job_types[*]}" == *"${job_type}"* ]]; then
+    return 0
+  fi
+  return 1
+}
+
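+# Builds a human-readable job name, e.g. (illustrative): "Test: gcc-12 C++17 amd64 V100 sm{70-real}"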
+get_job_name() {
+  local job_type="$1"
+
+  local formatted_job_type=$(format_job_type "${job_type}")
+  local gpu_str=""
+  if job_type_requires_gpu "${job_type}"; then
+    gpu_str="${gpu^^}"
+  fi
+
+  job_name="${formatted_job_type}: ${host_compiler_name}-${host_compiler_version} C++${std} ${cpu}"
+  if [[ -n "${gpu_str}" ]]; then
+    job_name="${job_name} ${gpu_str}"
+  fi
+  if [[ -n "${cuda_compile_arch}" ]]; then
+    job_name="${job_name} sm{${cuda_compile_arch}}"
+  fi
+  if [[ -n "${cmake_options}" ]]; then
+    job_name="${job_name} ${cmake_options}"
+  fi
+
+  echo "${job_name}"
+}
+
+runner_os() {
+  if is_windows; then
+    echo "windows"
+  else
+    echo "linux"
+  fi
+}
+
+cpu_runner() {
+  echo "$(runner_os)-${cpu}-cpu16"
+}
+
+gpu_runner() {
+  if is_testing_pool; then
+    echo "$(runner_os)-${cpu}-gpu-${gpu}-latest-l-testing"
+  else
+    echo "$(runner_os)-${cpu}-gpu-${gpu}-latest-1"
+  fi
+}
+
+image_name() {
+  echo "rapidsai/devcontainers:${devcontainer_version}-cpp-${host_compiler_name}${host_compiler_version}-cuda${ctk}-${os}"
+}
+
+script_name() {
+  local job_type=$1
+
+  if is_windows; then
+    echo "./ci/windows/${job_type}_${project}.ps1"
+  else
+    echo "./ci/${job_type}_${project}.sh"
+  fi
+}
+
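+# Assembles the argv for the CI script invocation, e.g. (illustrative):
+#   "./ci/test_cub.sh" -std 17 -cxx "gcc-11" -arch "70-real"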
+build_command_argv() {
+  local -n command_argv=$1
+  local job_type=$2
+
+  command_argv=()
+  command_argv+=("\"$(script_name ${job_type})\"")
+  command_argv+=("-std" "${std}")
+  if ! is_windows; then
+    command_argv+=("-cxx" "\"${host_compiler_exe}\"")
+  fi
+  if [[ -n "${cuda_compile_arch}" ]]; then
+    command_argv+=("-arch" "\"${cuda_compile_arch}\"")
+  fi
+  if [[ "${device_compiler_name}" != "nvcc" && ${device_compiler_name} != "nvrtc" ]]; then
+    command_argv+=("-cuda" "\"${device_compiler_exe}\"")
+  fi
+  cmake_args=$(echo "$cmake_options" | jq -r 'to_entries | map("\(.key)=\(.value)") | join(" ")')
+  if [[ -n "${cmake_args}" ]]; then
+    command_argv+=("-cmake-options" "\"${cmake_args}\"")
+  fi
+}
+
+build_run_as_coder_job() {
+  local job_type=$1
+
+  local job_name=$(get_job_name "${job_type}")
+
+  if job_type_requires_gpu "${job_type}"; then
+    local runner=$(gpu_runner)
+  else
+    local runner=$(cpu_runner)
+  fi
+
+  local image=$(image_name)
+  local command=()
+  build_command_argv command "${job_type}"
+
+
+  local job_json=$(jq -cn \
+    --arg name "${job_name}" \
+    --arg runner "${runner}" \
+    --arg image "${image}" \
+    --arg command "${command[*]}" \
+    '{name: $name, runner: $runner, image: $image, command: $command}')
+
+  echo "${job_json}"
+}
+
+append_json_object_to_array() {
+  local -n array_json=$1
+  local object_json=$2
+  array_json=$(
+    jq -cn --argjson array_json "${array_json}" --argjson object_json "${object_json}" \
+    '$array_json + [$object_json]'
+  )
+}
+
+add_stand_alone_workflow() {
+  local -n wf_json=$1
+  local job_type=$2
+  local job_json=$(build_run_as_coder_job "${job_type}")
+  append_json_object_to_array wf_json "${job_json}"
+}
+
+add_build_and_test_workflow() {
+  local -n wf_json=$1
+  local test_job_type=$2
+  local producer_json=$(build_run_as_coder_job "build")
+  local consumer_json=$(build_run_as_coder_job "${test_job_type}")
+
+  # If producer is already in the workflow, append the consumer to the existing producer:
+  local found_producer=$(
+    jq -n --argjson wf_json "${wf_json}" --argjson producer_json "${producer_json}" \
+    '$wf_json | .[] | select(.producers[0].name == $producer_json.name)'
+  )
+  if [[ -n "${found_producer}" ]]; then
+    wf_json=$(
+      jq -n --argjson wf_json "${wf_json}" --argjson producer_json "${producer_json}" --argjson consumer_json "${consumer_json}" \
+      '$wf_json | map(if .producers[0].name == $producer_json.name then .consumers += [$consumer_json] else . end)'
+    )
+  else
+    wf_json=$(
+      jq -n --argjson wf_json "${wf_json}" --argjson producer_json "${producer_json}" --argjson consumer_json "${consumer_json}" \
+      '$wf_json + [{producers: [$producer_json], consumers: [$consumer_json]}]'
+    )
+  fi
+}
+
+main() {
+  local standalone_workflow_json="[]"
+  local two_stage_workflows="[]"
+
+  for type in "${job_types[@]}"; do
+    if [[ "${type}" == "build" ]]; then
+      if has_test_jobs; then
+        continue # Build job will be added by test jobs.
+      fi
+      append_json_object_to_array standalone_workflow_json "$(build_run_as_coder_job "build")"
+    elif [[ "${type}" == "test" ]]; then
+      add_build_and_test_workflow two_stage_workflows "test"
+    elif [[ "${type}" == "nvrtc" ]]; then
+      append_json_object_to_array standalone_workflow_json "$(build_run_as_coder_job "nvrtc")"
+    else
+      echo "Unsupported job type: ${type}"
+      exit 1
+    fi
+  done
+
+  local output_json=$(jq -n \
+    --argjson standalone_workflow_json "${standalone_workflow_json}" \
+    --argjson two_stage_workflows "${two_stage_workflows}" \
+    '{standalone: $standalone_workflow_json, two_stage: $two_stage_workflows}')
+
+  # Uncomment to insert the input json under the key "input"
+  # output_json=$(
+    # jq -n --argjson output_json "${output_json}" --argjson input_json "${input_json}" \
+    # '$output_json + {input: $input_json}'
+  # )
+
+  echo "{ \"${top_level_group}\": ${output_json} }" | jq '.'
+}
+
+main
diff --git a/.github/workflows/ci-dispatch-group.yml b/.github/workflows/ci-dispatch-group.yml
new file mode 100644
index 00000000000..db9e260716d
--- /dev/null
+++ b/.github/workflows/ci-dispatch-group.yml
@@ -0,0 +1,48 @@
+name: "CI/Dispatch/Group"
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+on:
+  workflow_call:
+    inputs:
+      name: {type: string, required: true}
+      jobs: {type: string, required: true}
+
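+# The `jobs` input is one group's object from the compute-matrix WORKFLOW output; an
+# illustrative (assumed) shape:
+#   {"standalone": [{"name": ..., "runner": ..., "image": ..., "command": ...}, ...],
+#    "two_stage":  [{"producers": [...], "consumers": [...]}, ...]}
+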
+permissions:
+  contents: read
+
+jobs:
+  standalone-jobs:
+    name: ""
+    if: fromJSON(inputs.jobs).standalone
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{fromJSON(inputs.jobs).standalone}}
+    uses: ./.github/workflows/run-as-coder.yml
+    with:
+      name:   ${{ matrix.name }}
+      runner: ${{ matrix.runner }}
+      image:  ${{ matrix.image }}
+      command: |
+        ${{ matrix.command }}
+
+  two-stage-jobs:
+    name: ""
+    if: fromJSON(inputs.jobs).two_stage
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{fromJSON(inputs.jobs).two_stage}}
+    uses: ./.github/workflows/ci-dispatch-two-stage.yml
+    with:
+      producers: ${{ toJSON(matrix.producers) }}
+      consumers: ${{ toJSON(matrix.consumers) }}
diff --git a/.github/workflows/ci-dispatch-two-stage.yml b/.github/workflows/ci-dispatch-two-stage.yml
new file mode 100644
index 00000000000..b7e233111f9
--- /dev/null
+++ b/.github/workflows/ci-dispatch-two-stage.yml
@@ -0,0 +1,52 @@
+name: "CI/Dispatch/TwoStage"
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+on:
+  workflow_call:
+    inputs:
+      producers: {type: string, required: true}
+      consumers: {type: string, required: true}
+
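+# producers/consumers are JSON arrays of run-as-coder job descriptions
+# ({name, runner, image, command}); every producer must finish before any consumer starts.
+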
+permissions:
+  contents: read
+
+jobs:
+  producers:
+    name: ${{ matrix.name }}
+    if: fromJSON(inputs.producers)
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{fromJSON(inputs.producers)}}
+    uses: ./.github/workflows/run-as-coder.yml
+    with:
+      name:   ${{ matrix.name }}
+      runner: ${{ matrix.runner }}
+      image:  ${{ matrix.image }}
+      command: |
+        ${{ matrix.command }}
+
+  consumers:
+    name: ${{ matrix.name }}
+    if: fromJSON(inputs.consumers)
+    needs: producers
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{fromJSON(inputs.consumers)}}
+    uses: ./.github/workflows/run-as-coder.yml
+    with:
+      name:   ${{ matrix.name }}
+      runner: ${{ matrix.runner }}
+      image:  ${{ matrix.image }}
+      command: |
+        ${{ matrix.command }}
diff --git a/.github/workflows/dispatch-build-and-test.yml b/.github/workflows/dispatch-build-and-test.yml
index 7b5ed4ef272..3f8227152ed 100644
--- a/.github/workflows/dispatch-build-and-test.yml
+++ b/.github/workflows/dispatch-build-and-test.yml
@@ -4,6 +4,7 @@ on:
   workflow_call:
     inputs:
       project_name: {type: string, required: true}
+      job_type: {type: string, required: true}
       per_cuda_compiler_matrix: {type: string, required: true}
       devcontainer_version: {type: string, required: true}
       is_windows: {type: boolean, required: true}
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
new file mode 100644
index 00000000000..bdfdf490a01
--- /dev/null
+++ b/.github/workflows/nightly.yml
@@ -0,0 +1,98 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Nightly CI workflow (temporarily triggered on pull-request pushes for testing; see the FIXME below)
+name: nightly
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+on:
+  # FIXME: This should be a cron job that runs nightly
+  push: # Testing only
+    branches:
+      - "pull-request/[0-9]+"
+  # schedule:
+  #   - cron: '0 7 * * *' # 7AM UTC, 12AM PST, 3AM EST
+
+# Only runs one instance of this workflow at a time for a given PR and cancels any in-progress runs when a new one starts.
+concurrency:
+  group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  pull-requests: read
+
+jobs:
+  compute-matrix:
+    name: Compute matrix
+    runs-on: ubuntu-latest
+    outputs:
+      DEVCONTAINER_VERSION: ${{steps.compute-matrix.outputs.DEVCONTAINER_VERSION}}
+      WORKFLOW: ${{steps.compute-matrix.outputs.WORKFLOW}}
+      WORKFLOW_KEYS: ${{steps.compute-matrix.outputs.WORKFLOW_KEYS}}
+    steps:
+      - name: Get Base Branch from PR
+        id: get-pr-info
+        uses: nv-gha-runners/get-pr-info@main
+      - name: Checkout repo
+        uses: actions/checkout@v3
+      - name: Identify dirty subprojects
+        id: inspect-changes
+        run: |
+          ./ci/inspect_changes.sh ${BASE_SHA} ${GITHUB_SHA}
+        env:
+          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+      - name: Compute matrix outputs
+        id: compute-matrix
+        run: |
+          .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml nightly
+        env:
+          THRUST_DIRTY: ${{ steps.inspect-changes.outputs.THRUST_DIRTY }}
+          CUB_DIRTY: ${{ steps.inspect-changes.outputs.CUB_DIRTY }}
+          LIBCUDACXX_DIRTY: ${{ steps.inspect-changes.outputs.LIBCUDACXX_DIRTY }}
+
+  dispatch-groups:
+    name: ${{ matrix.name }}
+    needs:
+      - compute-matrix
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        name: ${{ fromJSON(needs.compute-matrix.outputs.WORKFLOW_KEYS) }}
+    uses: ./.github/workflows/ci-dispatch-group.yml
+    with:
+      name: ${{ matrix.name }}
+      jobs: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.WORKFLOW)[ matrix.name ]) }}
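+      # matrix.name selects one group's {standalone, two_stage} object from the WORKFLOW output.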
+
+  # This job is the final job that runs after all other jobs and is used for branch protection status checks.
+  # See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks
+  # https://github.com/orgs/community/discussions/26822#discussioncomment-5122101
+  ci:
+    runs-on: ubuntu-latest
+    name: CI
+    if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success
+    needs:
+      - dispatch-groups
+    steps:
+      - name: Check status of all precursor jobs
+        if: >-
+          ${{contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled')}}
+        run: exit 1
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 3dcee0cf6c6..2d268184026 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -35,13 +35,13 @@ permissions:
   pull-requests: read
 
 jobs:
-  inspect-changes:
-    name: "Inspect Changes"
+  compute-matrix:
+    name: Compute matrix
     runs-on: ubuntu-latest
     outputs:
-      LIBCUDACXX_DIRTY: ${{ steps.set-outputs.outputs.LIBCUDACXX_DIRTY }}
-      CUB_DIRTY: ${{ steps.set-outputs.outputs.CUB_DIRTY }}
-      THRUST_DIRTY: ${{ steps.set-outputs.outputs.THRUST_DIRTY }}
+      DEVCONTAINER_VERSION: ${{steps.compute-matrix.outputs.DEVCONTAINER_VERSION}}
+      WORKFLOW: ${{steps.compute-matrix.outputs.WORKFLOW}}
+      WORKFLOW_KEYS: ${{steps.compute-matrix.outputs.WORKFLOW_KEYS}}
     steps:
       - name: Get Base Branch from PR
         id: get-pr-info
@@ -49,175 +49,35 @@ jobs:
       - name: Checkout repo
         uses: actions/checkout@v3
       - name: Identify dirty subprojects
-        id: set-outputs
+        id: inspect-changes
         run: |
           ./ci/inspect_changes.sh ${BASE_SHA} ${GITHUB_SHA}
         env:
           BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
-
-  compute-matrix:
-    name: Compute matrix
-    runs-on: ubuntu-latest
-    needs:
-      - inspect-changes
-    outputs:
-      DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}}
-      PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}}
-      PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}}
-      NVRTC_MATRIX: ${{steps.set-outputs.outputs.NVRTC_MATRIX}}
-      CLANG_CUDA_MATRIX: ${{steps.set-outputs.outputs.CLANG_CUDA_MATRIX}}
-      CCCL_INFRA_MATRIX: ${{steps.set-outputs.outputs.CCCL_INFRA_MATRIX}}
-    steps:
-      - name: Checkout repo
-        uses: actions/checkout@v3
       - name: Compute matrix outputs
-        id: set-outputs
+        id: compute-matrix
         run: |
           .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request
         env:
-          THRUST_DIRTY: ${{ needs.inspect-changes.outputs.THRUST_DIRTY }}
-          CUB_DIRTY: ${{ needs.inspect-changes.outputs.CUB_DIRTY }}
-          LIBCUDACXX_DIRTY: ${{ needs.inspect-changes.outputs.LIBCUDACXX_DIRTY }}
+          THRUST_DIRTY: ${{ steps.inspect-changes.outputs.THRUST_DIRTY }}
+          CUB_DIRTY: ${{ steps.inspect-changes.outputs.CUB_DIRTY }}
+          LIBCUDACXX_DIRTY: ${{ steps.inspect-changes.outputs.LIBCUDACXX_DIRTY }}
 
-  nvrtc:
-    name: libcudacxx NVRTC CUDA${{matrix.cuda}}
-    permissions:
-      id-token: write
-      contents: read
-    needs:
-      - compute-matrix
-      - inspect-changes
-    if: ${{ !contains(github.event.head_commit.message, 'skip-tests') && needs.inspect-changes.outputs.LIBCUDACXX_DIRTY == 'true' }}
-    uses: ./.github/workflows/run-as-coder.yml
-    strategy:
-      fail-fast: false
-      matrix:
-        include: ${{ fromJSON(needs.compute-matrix.outputs.NVRTC_MATRIX) }}
-    with:
-      name: Build and Test libcudacxx CUDA${{matrix.cuda}} C++${{matrix.std}}
-      runner: linux-${{matrix.cpu}}-gpu-v100-latest-1
-      image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-gcc12-cuda${{matrix.cuda}}-${{matrix.os}}
-      command: |
-        ./ci/nvrtc_libcudacxx.sh -cxx g++ -std ${{matrix.std}}
-
-  thrust:
-    name: Thrust CUDA${{ matrix.cuda_host_combination }}
-    permissions:
-      id-token: write
-      contents: read
+  dispatch-groups:
+    name: ${{ matrix.name }}
     needs:
       - compute-matrix
-      - inspect-changes
-    if: ${{ needs.inspect-changes.outputs.THRUST_DIRTY == 'true' }}
-    uses: ./.github/workflows/dispatch-build-and-test.yml
-    strategy:
-      fail-fast: false
-      matrix:
-        cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }}
-    with:
-      project_name: "thrust"
-      per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }}
-      devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
-      is_windows: ${{ contains(matrix.cuda_host_combination, 'cl') }}
-
-  cub:
-    name: CUB CUDA${{ matrix.cuda_host_combination }}
     permissions:
       id-token: write
       contents: read
-    needs:
-      - compute-matrix
-      - inspect-changes
-    if: ${{ needs.inspect-changes.outputs.CUB_DIRTY == 'true' }}
-    uses: ./.github/workflows/dispatch-build-and-test.yml
     strategy:
       fail-fast: false
       matrix:
-        cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }}
+        name: ${{ fromJSON(needs.compute-matrix.outputs.WORKFLOW_KEYS) }}
+    uses: ./.github/workflows/ci-dispatch-group.yml
     with:
-      project_name: "cub"
-      per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }}
-      devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
-      is_windows: ${{ contains(matrix.cuda_host_combination, 'cl') }}
-
-  libcudacxx:
-    name: libcudacxx CUDA${{ matrix.cuda_host_combination }}
-    permissions:
-      id-token: write
-      contents: read
-    needs:
-      - compute-matrix
-      - inspect-changes
-    if: ${{ needs.inspect-changes.outputs.LIBCUDACXX_DIRTY == 'true' }}
-    uses: ./.github/workflows/dispatch-build-and-test.yml
-    strategy:
-      fail-fast: false
-      matrix:
-        cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }}
-    with:
-      project_name: "libcudacxx"
-      per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }}
-      devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
-      is_windows: ${{ contains(matrix.cuda_host_combination, 'cl') }}
-
-  clang-cuda:
-    name: ${{matrix.lib}} Clang CUDA
-    permissions:
-      id-token: write
-      contents: read
-    needs: compute-matrix
-    strategy:
-      fail-fast: false
-      matrix:
-        include: ${{ fromJSON(needs.compute-matrix.outputs.CLANG_CUDA_MATRIX) }}
-    uses: ./.github/workflows/run-as-coder.yml
-    with:
-      name: Build ${{matrix.lib}} ${{matrix.cpu}}/clang-cuda${{matrix.compiler.version}}/C++${{matrix.std}}
-      runner: linux-${{matrix.cpu}}-cpu16
-      image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
-      command: |
-        ./ci/build_${{matrix.lib}}.sh -cxx "${{matrix.compiler.exe}}" -cuda "${{matrix.compiler.exe}}" -std "${{matrix.std}}"
-
-  cccl-infra:
-    name: CCCL Infrastructure
-    permissions:
-      id-token: write
-      contents: read
-    needs: compute-matrix
-    if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include: ${{ fromJSON(needs.compute-matrix.outputs.CCCL_INFRA_MATRIX) }}
-    uses: ./.github/workflows/run-as-coder.yml
-    with:
-      name: CCCL Examples CUDA${{matrix.cuda}} ${{matrix.compiler.name}}${{matrix.compiler.version}}
-      runner: linux-${{matrix.cpu}}-gpu-v100-latest-1
-      image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
-      command: |
-        cmake -S . --preset=cccl-infra -DCCCL_EXAMPLE_CPM_TAG=${GITHUB_SHA}
-        ctest --preset=cccl-infra
-
-  verify-devcontainers:
-    name: Verify Dev Containers
-    permissions:
-      id-token: write
-      contents: read
-    uses: ./.github/workflows/verify-devcontainers.yml
-
-  verify-codegen:
-    name: Verify Codegen in libcudacxx
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repo
-        uses: actions/checkout@v3
-      - name: Run libcudacxx codegen verification
-        id: verify-codegen
-        run: |
-          sudo apt-get update
-          sudo apt-get install ninja-build
-          export CXX="g++"
-          ./ci/verify_codegen.sh
+      name: ${{ matrix.name }}
+      jobs: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.WORKFLOW)[ matrix.name ]) }}
 
   # This job is the final job that runs after all other jobs and is used for branch protection status checks.
   # See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks
@@ -227,16 +87,216 @@ jobs:
     name: CI
     if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success
     needs:
-      - clang-cuda
-      - cub
-      - libcudacxx
-      - nvrtc
-      - thrust
-      - cccl-infra
-      - verify-devcontainers
-      - verify-codegen
+      - dispatch-groups
     steps:
       - name: Check status of all precursor jobs
         if: >-
           ${{contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled')}}
         run: exit 1
+
+# jobs:
+#   inspect-changes:
+#     name: "Inspect Changes"
+#     runs-on: ubuntu-latest
+#     outputs:
+#       LIBCUDACXX_DIRTY: ${{ steps.set-outputs.outputs.LIBCUDACXX_DIRTY }}
+#       CUB_DIRTY: ${{ steps.set-outputs.outputs.CUB_DIRTY }}
+#       THRUST_DIRTY: ${{ steps.set-outputs.outputs.THRUST_DIRTY }}
+#     steps:
+#       - name: Get Base Branch from PR
+#         id: get-pr-info
+#         uses: nv-gha-runners/get-pr-info@main
+#       - name: Checkout repo
+#         uses: actions/checkout@v3
+#       - name: Identify dirty subprojects
+#         id: set-outputs
+#         run: |
+#           ./ci/inspect_changes.sh ${BASE_SHA} ${GITHUB_SHA}
+#         env:
+#           BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+
+#   compute-matrix:
+#     name: Compute matrix
+#     runs-on: ubuntu-latest
+#     needs:
+#       - inspect-changes
+#     outputs:
+#       DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}}
+#       PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}}
+#       PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}}
+#       NVRTC_MATRIX: ${{steps.set-outputs.outputs.NVRTC_MATRIX}}
+#       CLANG_CUDA_MATRIX: ${{steps.set-outputs.outputs.CLANG_CUDA_MATRIX}}
+#       CCCL_INFRA_MATRIX: ${{steps.set-outputs.outputs.CCCL_INFRA_MATRIX}}
+#     steps:
+#       - name: Checkout repo
+#         uses: actions/checkout@v3
+#       - name: Compute matrix outputs
+#         id: set-outputs
+#         run: |
+#           .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request
+#         env:
+#           THRUST_DIRTY: ${{ needs.inspect-changes.outputs.THRUST_DIRTY }}
+#           CUB_DIRTY: ${{ needs.inspect-changes.outputs.CUB_DIRTY }}
+#           LIBCUDACXX_DIRTY: ${{ needs.inspect-changes.outputs.LIBCUDACXX_DIRTY }}
+
+#   nvrtc:
+#     name: libcudacxx NVRTC CUDA${{matrix.cuda}}
+#     permissions:
+#       id-token: write
+#       contents: read
+#     needs:
+#       - compute-matrix
+#       - inspect-changes
+#     if: ${{ !contains(github.event.head_commit.message, 'skip-tests') && needs.inspect-changes.outputs.LIBCUDACXX_DIRTY == 'true' }}
+#     uses: ./.github/workflows/run-as-coder.yml
+#     strategy:
+#       fail-fast: false
+#       matrix:
+#         include: ${{ fromJSON(needs.compute-matrix.outputs.NVRTC_MATRIX) }}
+#     with:
+#       name: Build and Test libcudacxx CUDA${{matrix.cuda}} C++${{matrix.std}}
+#       runner: linux-${{matrix.cpu}}-gpu-v100-latest-1
+#       image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-gcc12-cuda${{matrix.cuda}}-${{matrix.os}}
+#       command: |
+#         ./ci/nvrtc_libcudacxx.sh -cxx g++ -std ${{matrix.std}}
+
+#   thrust:
+#     name: Thrust CUDA${{ matrix.cuda_host_combination }}
+#     permissions:
+#       id-token: write
+#       contents: read
+#     needs:
+#       - compute-matrix
+#       - inspect-changes
+#     if: ${{ needs.inspect-changes.outputs.THRUST_DIRTY == 'true' }}
+#     uses: ./.github/workflows/dispatch-build-and-test.yml
+#     strategy:
+#       fail-fast: false
+#       matrix:
+#         cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }}
+#     with:
+#       project_name: "thrust"
+#       per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }}
+#       devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
+#       is_windows: ${{ contains(matrix.cuda_host_combination, 'cl') }}
+
+#   cub:
+#     name: CUB CUDA${{ matrix.cuda_host_combination }}
+#     permissions:
+#       id-token: write
+#       contents: read
+#     needs:
+#       - compute-matrix
+#       - inspect-changes
+#     if: ${{ needs.inspect-changes.outputs.CUB_DIRTY == 'true' }}
+#     uses: ./.github/workflows/dispatch-build-and-test.yml
+#     strategy:
+#       fail-fast: false
+#       matrix:
+#         cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }}
+#     with:
+#       project_name: "cub"
+#       per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }}
+#       devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
+#       is_windows: ${{ contains(matrix.cuda_host_combination, 'cl') }}
+
+#   libcudacxx:
+#     name: libcudacxx CUDA${{ matrix.cuda_host_combination }}
+#     permissions:
+#       id-token: write
+#       contents: read
+#     needs:
+#       - compute-matrix
+#       - inspect-changes
+#     if: ${{ needs.inspect-changes.outputs.LIBCUDACXX_DIRTY == 'true' }}
+#     uses: ./.github/workflows/dispatch-build-and-test.yml
+#     strategy:
+#       fail-fast: false
+#       matrix:
+#         cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }}
+#     with:
+#       project_name: "libcudacxx"
+#       per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }}
+#       devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
+#       is_windows: ${{ contains(matrix.cuda_host_combination, 'cl') }}
+
+#   clang-cuda:
+#     name: ${{matrix.lib}} Clang CUDA
+#     permissions:
+#       id-token: write
+#       contents: read
+#     needs: compute-matrix
+#     strategy:
+#       fail-fast: false
+#       matrix:
+#         include: ${{ fromJSON(needs.compute-matrix.outputs.CLANG_CUDA_MATRIX) }}
+#     uses: ./.github/workflows/run-as-coder.yml
+#     with:
+#       name: Build ${{matrix.lib}} ${{matrix.cpu}}/clang-cuda${{matrix.compiler.version}}/C++${{matrix.std}}
+#       runner: linux-${{matrix.cpu}}-cpu16
+#       image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
+#       command: |
+#         ./ci/build_${{matrix.lib}}.sh -cxx "${{matrix.compiler.exe}}" -cuda "${{matrix.compiler.exe}}" -std "${{matrix.std}}"
+
+#   cccl-infra:
+#     name: CCCL Infrastructure
+#     permissions:
+#       id-token: write
+#       contents: read
+#     needs: compute-matrix
+#     if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
+#     strategy:
+#       fail-fast: false
+#       matrix:
+#         include: ${{ fromJSON(needs.compute-matrix.outputs.CCCL_INFRA_MATRIX) }}
+#     uses: ./.github/workflows/run-as-coder.yml
+#     with:
+#       name: CCCL Examples CUDA${{matrix.cuda}} ${{matrix.compiler.name}}${{matrix.compiler.version}}
+#       runner: linux-${{matrix.cpu}}-gpu-v100-latest-1
+#       image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
+#       command: |
+#         cmake -S . --preset=cccl-infra -DCCCL_EXAMPLE_CPM_TAG=${GITHUB_SHA}
+#         ctest --preset=cccl-infra
+
+#   verify-devcontainers:
+#     name: Verify Dev Containers
+#     permissions:
+#       id-token: write
+#       contents: read
+#     uses: ./.github/workflows/verify-devcontainers.yml
+
+#   verify-codegen:
+#     name: Verify Codegen in libcudacxx
+#     runs-on: ubuntu-latest
+#     steps:
+#       - name: Checkout repo
+#         uses: actions/checkout@v3
+#       - name: Run libcudacxx codegen verification
+#         id: verify-codegen
+#         run: |
+#           sudo apt-get update
+#           sudo apt-get install ninja-build
+#           export CXX="g++"
+#           ./ci/verify_codegen.sh
+
+#   # This job is the final job that runs after all other jobs and is used for branch protection status checks.
+#   # See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks
+#   # https://github.com/orgs/community/discussions/26822#discussioncomment-5122101
+#   ci:
+#     runs-on: ubuntu-latest
+#     name: CI
+#     if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success
+#     needs:
+#       - clang-cuda
+#       - cub
+#       - libcudacxx
+#       - nvrtc
+#       - thrust
+#       - cccl-infra
+#       - verify-devcontainers
+#       - verify-codegen
+#     steps:
+#       - name: Check status of all precursor jobs
+#         if: >-
+#           ${{contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled')}}
+#         run: exit 1
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 589de44bd3c..335a7ea0114 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -1,12 +1,20 @@
+ctk_prev_min: &ctk_prev_min '11.1'
+ctk_prev_max:  &ctk_prev_max  '11.8'
+ctk_curr: &ctk_curr '12.4'
 
-cuda_prev_min: &cuda_prev_min '11.1'
-cuda_prev_max:  &cuda_prev_max  '11.8'
-cuda_curr: &cuda_curr '12.4'
+projects_common: &projects_common
+  - 'libcudacxx'
+  - 'cub'
+  - 'thrust'
 
-# The GPUs to test on
-gpus:
-  - 'a100'
-  - 'v100'
+gpus: &gpus
+  - 'v100'     # ??x: sm70, 32 GB
+  - 't4'       #  8x: sm75, 16 GB
+  - 'rtx2080'  #  8x: sm75,  8 GB
+  - 'rtxa6000' # 12x: sm86, 48 GB
+  - 'l4'       # 48x: sm89, 24 GB
+  - 'rtx4090'  # 10x: sm89, 24 GB
+  - 'h100'     # 16x: sm90, ?? GB
 
 # The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers
 devcontainer_version: '24.06'
@@ -44,7 +52,7 @@ oneapi: &oneapi { name: 'oneapi', version: '2023.2.0', exe: 'icpc' }
 
 # Each environment below will generate a unique build/test job
 # See the "compute-matrix" job in the workflow for how this is parsed and used
-# cuda: The CUDA Toolkit version
+# ctk: The CUDA Toolkit version
 # os: The operating system used
 # cpu: The CPU architecture
 # compiler: The compiler to use
@@ -56,40 +64,54 @@ oneapi: &oneapi { name: 'oneapi', version: '2023.2.0', exe: 'icpc' }
 
 # Configurations that will run for every PR
 pull_request:
-  nvcc:
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc6,     std: [11, 14],         jobs: ['build']}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc7,     std: [11, 14, 17],     jobs: ['build']}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc8,     std: [11, 14, 17],     jobs: ['build']}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc9,     std: [11, 14, 17],     jobs: ['build']}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm9,    std: [11, 14, 17],     jobs: ['build']}
-    - {cuda: *cuda_prev_min, os: 'windows2022', cpu: 'amd64', compiler: *msvc2017, std: [14, 17],         jobs: ['build']}
-    - {cuda: *cuda_prev_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11,    std: [11, 14, 17],     jobs: ['build'], extra_build_args: '-cmake-options -DCMAKE_CUDA_ARCHITECTURES=90'}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7,     std: [11, 14, 17],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8,     std: [11, 14, 17],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9,     std: [11, 14, 17],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10,    std: [11, 14, 17, 20], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11,    std: [11, 14, 17, 20], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12,    std: [11, 14, 17, 20], jobs: ['build'], extra_build_args: '-cmake-options -DCMAKE_CUDA_ARCHITECTURES=90a'}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12,    std: [11, 14, 17, 20], jobs: ['build', 'test']}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'arm64', compiler: *gcc12,    std: [11, 14, 17, 20], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9,    std: [11, 14, 17],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10,   std: [11, 14, 17],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11,   std: [11, 14, 17, 20], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12,   std: [11, 14, 17, 20], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13,   std: [11, 14, 17, 20], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14,   std: [11, 14, 17, 20], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15,   std: [11, 14, 17, 20], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16,   std: [11, 14, 17, 20], jobs: ['build', 'test']}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'arm64', compiler: *llvm16,   std: [11, 14, 17, 20], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'windows2022', cpu: 'amd64', compiler: *msvc2019, std: [14, 17],         jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'windows2022', cpu: 'amd64', compiler: *msvc2022, std: [14, 17, 20],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *oneapi,   std: [11, 14, 17],     jobs: ['build']}
-  nvrtc:
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', std: [11, 14, 17, 20]}
-  clang-cuda:
-    - {lib: ['thrust', 'cub', 'libcudacxx'], cuda: *cuda_curr, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm-newest, std: [17, 20]}
-  cccl-infra:
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc-oldest}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm-oldest}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc-newest}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm-newest}
+  - {job_types: ['build'], ctk: *ctk_prev_min, os: 'ubuntu18.04', host_compiler: *gcc6,     std: [11, 14],         project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_prev_min, os: 'ubuntu18.04', host_compiler: *gcc7,     std: [11, 14, 17],     project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_prev_min, os: 'ubuntu18.04', host_compiler: *gcc8,     std: [11, 14, 17],     project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_prev_min, os: 'ubuntu18.04', host_compiler: *gcc9,     std: [11, 14, 17],     project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_prev_min, os: 'ubuntu18.04', host_compiler: *llvm9,    std: [11, 14, 17],     project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_prev_min, os: 'windows2022', host_compiler: *msvc2017, std: [14, 17],         project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_prev_max, os: 'ubuntu22.04', host_compiler: *gcc11,    std: [11, 14, 17],     project: *projects_common, cuda_compile_arch: '90'}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu20.04', host_compiler: *gcc7,     std: [11, 14, 17],     project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu20.04', host_compiler: *gcc8,     std: [11, 14, 17],     project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu20.04', host_compiler: *gcc9,     std: [11, 14, 17],     project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu20.04', host_compiler: *gcc10,    std: [11, 14, 17, 20], project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu22.04', host_compiler: *gcc11,    std: [11, 14, 17, 20], project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu22.04', host_compiler: *gcc12,    std: [11, 14, 17, 20], project: *projects_common, cuda_compile_arch: '90'}
+  - {job_types: ['test'],  ctk: *ctk_curr,     os: 'ubuntu22.04', host_compiler: *gcc12,    std: [11, 14, 17, 20], project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu22.04', host_compiler: *gcc12,    std: [11, 14, 17, 20], project: *projects_common, cpu: 'arm64'}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu20.04', host_compiler: *llvm9,    std: [11, 14, 17],     project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu20.04', host_compiler: *llvm10,   std: [11, 14, 17],     project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu20.04', host_compiler: *llvm11,   std: [11, 14, 17, 20], project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu20.04', host_compiler: *llvm12,   std: [11, 14, 17, 20], project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu20.04', host_compiler: *llvm13,   std: [11, 14, 17, 20], project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu20.04', host_compiler: *llvm14,   std: [11, 14, 17, 20], project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu22.04', host_compiler: *llvm15,   std: [11, 14, 17, 20], project: *projects_common}
+  - {job_types: ['test'],  ctk: *ctk_curr,     os: 'ubuntu22.04', host_compiler: *llvm16,   std: [11, 14, 17, 20], project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu22.04', host_compiler: *llvm16,   std: [11, 14, 17, 20], project: *projects_common, cpu: 'arm64'}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'windows2022', host_compiler: *msvc2019, std: [14, 17],         project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'windows2022', host_compiler: *msvc2022, std: [14, 17, 20],     project: *projects_common}
+  - {job_types: ['build'], ctk: *ctk_curr,     os: 'ubuntu22.04', host_compiler: *oneapi,   std: [11, 14, 17],     project: *projects_common}
+  # nvrtc:
+  - {job_types: ['nvrtc'], project: ['libcudacxx'], ctk: *ctk_curr, os: 'ubuntu22.04', host_compiler: *gcc12, std: [11, 14, 17, 20]}
+  # clang-cuda:
+  - {job_types: ['build'], device_compiler: *llvm-newest, host_compiler: *llvm-newest, ctk: *ctk_curr, os: 'ubuntu22.04', std: [17, 20], project: *projects_common}
+  # cccl-infra:
+  # TODO:
+  # - {ctk: *ctk_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc-oldest}
+  # - {ctk: *ctk_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm-oldest}
+  # - {ctk: *ctk_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc-newest}
+  # - {ctk: *ctk_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm-newest}
+
+nightly:
+  - {job_types: ['test'],  ctk: *ctk_prev_min, gpu: 'v100',     cuda_compile_arch: '70-real', host_compiler: *gcc6,   std: [11],             project: *projects_common, os: 'ubuntu18.04'}
+  - {job_types: ['test'],  ctk: *ctk_prev_min, gpu: 't4',       cuda_compile_arch: '75-real', host_compiler: *llvm9,  std: [17],             project: *projects_common, os: 'ubuntu18.04'}
+  - {job_types: ['test'],  ctk: *ctk_prev_max, gpu: 'rtx2080',  cuda_compile_arch: '75-real', host_compiler: *gcc11,  std: [17],             project: *projects_common, os: 'ubuntu22.04'}
+  - {job_types: ['test'],  ctk: *ctk_curr,     gpu: 'rtxa6000', cuda_compile_arch: '86-real', host_compiler: *gcc7,   std: [14],             project: *projects_common, os: 'ubuntu20.04'}
+  - {job_types: ['test'],  ctk: *ctk_curr,     gpu: 'l4',       cuda_compile_arch: '89-real', host_compiler: *gcc12,  std: [11, 14, 17, 20], project: *projects_common, os: 'ubuntu22.04'}
+  - {job_types: ['test'],  ctk: *ctk_curr,     gpu: 'rtx4090',  cuda_compile_arch: '89-real', host_compiler: *llvm9,  std: [11],             project: *projects_common, os: 'ubuntu20.04'}
+  - {job_types: ['test'],  ctk: *ctk_curr,     gpu: 'h100',     cuda_compile_arch: '90-real', host_compiler: *gcc12,  std: [11, 20],         project: *projects_common, os: 'ubuntu22.04'}
+  - {job_types: ['test'],  ctk: *ctk_curr,     gpu: 'h100',     cuda_compile_arch: '90-real', host_compiler: *llvm16, std: [17],             project: *projects_common, os: 'ubuntu22.04'}
+  - {job_types: ['nvrtc'], ctk: *ctk_curr,     gpu: 't4',       cuda_compile_arch: '75-real', host_compiler: *gcc12,  std: [20],             project: ['libcudacxx'],   os: 'ubuntu22.04'}
+  - {job_types: ['nvrtc'], ctk: *ctk_curr,     gpu: 'rtxa6000', cuda_compile_arch: '86-real', host_compiler: *gcc12,  std: [20],             project: ['libcudacxx'],   os: 'ubuntu22.04'}
+  - {job_types: ['nvrtc'], ctk: *ctk_curr,     gpu: 'l4',       cuda_compile_arch: '89-real', host_compiler: *gcc12,  std: [11, 14, 17, 20], project: ['libcudacxx'],   os: 'ubuntu22.04'}
+  - {job_types: ['nvrtc'], ctk: *ctk_curr,     gpu: 'h100',     cuda_compile_arch: '90-real', host_compiler: *gcc12,  std: [11, 20],         project: ['libcudacxx'],   os: 'ubuntu22.04'}