Integrate cudastf into CudaX. #2526

Closed · wants to merge 67 commits
Commits
cbb0edd
Initial import and rename of STF headers.
alliepiper Oct 9, 2024
afa153d
Refactor include paths to match cudax conventions.
alliepiper Oct 9, 2024
4b2cf18
Apply CCCL clang-format to STF files.
alliepiper Oct 9, 2024
450136e
Split STF headers into a separate headertest unit.
alliepiper Oct 9, 2024
09213f6
Fix -Wreorder warnings.
alliepiper Oct 9, 2024
c587b36
Fix -Wsign-compare warnings.
alliepiper Oct 9, 2024
2030832
s/I/Idx/g (Identifier I conflicts with complex.h system headers).
alliepiper Oct 9, 2024
7a2a842
Add missing includes.
alliepiper Oct 9, 2024
71196f1
Add missing execution space annotations.
alliepiper Oct 9, 2024
ebc205a
Fix standalone compilation of logical_data.cuh.
alliepiper Oct 9, 2024
750db80
Limit `no_device_stack` pragma to NVHPC.
alliepiper Oct 9, 2024
5c55fef
Temporarily exclude some failing headers from header testing.
alliepiper Oct 9, 2024
03d0a33
Mark a variable as potentially unused (due to some constexpr condition)
caugonnet Oct 10, 2024
705502c
Fix a VLA issue with CUDA graph API
caugonnet Oct 10, 2024
5aed944
Merge pull request #2 from caugonnet/cudastf
alliepiper Oct 10, 2024
4f2903b
Initial addition of stf unit tests.
alliepiper Oct 10, 2024
6ce5600
Clang-format tests
alliepiper Oct 10, 2024
dcd66ea
Rename stf test headers -> .cuh
alliepiper Oct 10, 2024
d236944
Update includes to match new header conventions.
alliepiper Oct 10, 2024
a9f7607
Add examples, update includes, reformat.
alliepiper Oct 10, 2024
647c68b
More reformatting.
alliepiper Oct 10, 2024
3cd7371
Update build system for STF tests / examples.
alliepiper Oct 10, 2024
0b6c033
Remove modification to cccl_configure_target.
alliepiper Oct 10, 2024
48c9a3e
Fix typo.
alliepiper Oct 10, 2024
3c0f74d
Link cuda driver to stf tests/examples.
alliepiper Oct 10, 2024
aa1643f
meyer singletons require protected ctors and cleanup misc. C++ issues
caugonnet Oct 10, 2024
13b042e
Misc fixes for callback_queues (which was not well crafted)
caugonnet Oct 10, 2024
d9647fb
static initialization of the common field in cudaMemAllocNodeParams. …
caugonnet Oct 10, 2024
64f9b20
Use default constructor rather than deleting it for graph event impl
caugonnet Oct 10, 2024
0faa8ee
Put aside tests which require math libs
caugonnet Oct 10, 2024
1de1892
Fix unused var warning
caugonnet Oct 10, 2024
8871ebd
remove unused variable
caugonnet Oct 10, 2024
470c8c8
Merge pull request #3 from caugonnet/cudastf
alliepiper Oct 10, 2024
b6d415a
Make mathlibs optional for STF builds.
alliepiper Oct 10, 2024
3d446f1
fix constness issue
caugonnet Oct 10, 2024
2b30f49
use size_t instead of integers
caugonnet Oct 10, 2024
c8d6eec
fix constness issue
caugonnet Oct 10, 2024
dfd293b
Fix minor warnings
caugonnet Oct 10, 2024
15f2197
Fix sizeness, constness and VLA issues
caugonnet Oct 10, 2024
be00476
Solve an unused variable issue due to constexpr
caugonnet Oct 11, 2024
f07a1a1
fix constness issue in logical_data arguments
caugonnet Oct 11, 2024
78cabc1
Solve a parsing error in g++
caugonnet Oct 11, 2024
87f8a91
Fix the equality operator of the exec_place_grid to avoid ambiguities…
caugonnet Oct 11, 2024
86af6d4
avoid a dangling reference warning
caugonnet Oct 11, 2024
ac389ce
Add a missing header
caugonnet Oct 11, 2024
6107281
Make the unique id mutable to make it possible to have a default cons…
caugonnet Oct 11, 2024
72887f5
Add mutable to allow a default ctor
caugonnet Oct 11, 2024
81e9090
remove unused variable
caugonnet Oct 11, 2024
907a4cb
Avoid a GNU specific syntax by providing a ctor for metadata
caugonnet Oct 11, 2024
255c8d4
add missing override keyword and remove extra virtual
caugonnet Oct 11, 2024
f7608d0
default exec_place_grid ctor
caugonnet Oct 11, 2024
b9a2617
Remove useless const keywords
caugonnet Oct 11, 2024
64f7b93
Revert change committed by mistake
caugonnet Oct 11, 2024
f6515df
Merge pull request #4 from caugonnet/cudastf
alliepiper Oct 11, 2024
8ad43ff
Add thrust/cub deps to cudax/stf.
alliepiper Oct 11, 2024
29a571b
Temporarily limit CI to just cudax jobs.
alliepiper Oct 11, 2024
f3458cd
Back out earlier change to unittest.cuh.
alliepiper Oct 11, 2024
bd64c01
Add mathlibs flag to all-dev preset.
alliepiper Oct 11, 2024
798570c
Add infra for STF unittested headers.
alliepiper Oct 11, 2024
d09315a
Pull common stf config into a helper function.
alliepiper Oct 11, 2024
717a4d6
Add STF static error tests.
alliepiper Oct 11, 2024
8e303ec
Remove redundant `stf` in test target names.
alliepiper Oct 11, 2024
7d7a696
Rename test/example targets to match conventions.
alliepiper Oct 11, 2024
4b6e84a
Change default for debugging option.
alliepiper Oct 11, 2024
717fb0c
Fix STF optional definition.
alliepiper Oct 11, 2024
914d91b
Add cudax_ENABLE_CUDASTF_DEBUG option.
alliepiper Oct 11, 2024
c02a903
Boundscheck requires debug mode.
alliepiper Oct 11, 2024
13 changes: 11 additions & 2 deletions CMakePresets.json
@@ -62,6 +62,10 @@
"THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB": true,
"cudax_ENABLE_HEADER_TESTING": true,
"cudax_ENABLE_TESTING": true,
"cudax_ENABLE_EXAMPLES": true,
"cudax_ENABLE_CUDASTF_BOUNDSCHECK": false,
"cudax_ENABLE_CUDASTF_DEBUG": false,
"cudax_ENABLE_CUDASTF_MATHLIBS": false,
"cudax_ENABLE_DIALECT_CPP17": true,
"cudax_ENABLE_DIALECT_CPP20": true
}
@@ -71,9 +75,11 @@
"displayName": "all-dev debug",
"inherits": "all-dev",
"cacheVariables": {
"CCCL_ENABLE_BENCHMARKS": false,
"CMAKE_BUILD_TYPE": "Debug",
"CMAKE_CUDA_FLAGS": "-G"
"CMAKE_CUDA_FLAGS": "-G",
"CCCL_ENABLE_BENCHMARKS": false,
"cudax_ENABLE_CUDASTF_BOUNDSCHECK": true,
"cudax_ENABLE_CUDASTF_DEBUG": true
}
},
{
@@ -295,6 +301,9 @@
"CCCL_ENABLE_CUDAX": true,
"cudax_ENABLE_HEADER_TESTING": true,
"cudax_ENABLE_TESTING": true,
"cudax_ENABLE_EXAMPLES": true,
"cudax_ENABLE_CUDASTF_BOUNDSCHECK": false,
"cudax_ENABLE_CUDASTF_MATHLIBS": false,
"cudax_ENABLE_DIALECT_CPP17": false,
"cudax_ENABLE_DIALECT_CPP20": false
}
12 changes: 12 additions & 0 deletions ci/matrix.yaml
@@ -8,6 +8,18 @@ workflows:
# - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'llvm16']}
#
override:
- {jobs: ['build'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc9', 'gcc10', 'gcc11']}
- {jobs: ['build'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13']}
- {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17']}
- {jobs: ['build'], project: 'cudax', ctk: ['12.0', ], std: 20, cxx: ['msvc14.36']}
- {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 20, cxx: ['msvc2022']}
- {jobs: ['build'], project: 'cudax', ctk: ['12.0' ], std: 17, cxx: ['gcc12'], sm: "90"}
- {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 17, cxx: ['gcc13'], sm: "90a"}
- {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13', 'clang16'], cpu: 'arm64'}
- {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'min', cxx: ['gcc12']}
- {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc12']}
- {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'max', cxx: ['clang14']}
- {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'max', cxx: ['clang18']}

pull_request:
# Old CTK
16 changes: 14 additions & 2 deletions cudax/CMakeLists.txt
@@ -16,9 +16,19 @@ endif()

option(cudax_ENABLE_HEADER_TESTING "Test that CUDA Experimental's public headers compile." ON)
option(cudax_ENABLE_TESTING "Build CUDA Experimental's tests." ON)
option(cudax_ENABLE_EXAMPLES "Build CUDA Experimental's examples." ON)
option(cudax_ENABLE_CUDASTF_BOUNDSCHECK "Enable bounds checks for STF targets. Requires debug build." OFF)
option(cudax_ENABLE_CUDASTF_DEBUG "Enable additional debugging for STF targets. Requires debug build." OFF)
option(cudax_ENABLE_CUDASTF_MATHLIBS "Enable STF tests/examples that use cublas/cusolver." OFF)

if ((cudax_ENABLE_CUDASTF_BOUNDSCHECK OR cudax_ENABLE_CUDASTF_DEBUG) AND
NOT CMAKE_BUILD_TYPE MATCHES "Debug" AND NOT CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo")
message(FATAL_ERROR "cudax_ENABLE_CUDASTF_BOUNDSCHECK and cudax_ENABLE_CUDASTF_DEBUG require a Debug or RelWithDebInfo build.")
endif()

include(cmake/cudaxBuildCompilerTargets.cmake)
include(cmake/cudaxBuildTargetList.cmake)
include(cmake/cudaxSTFConfigureTarget.cmake)

cudax_build_compiler_targets()
cudax_build_target_list()
@@ -28,7 +38,9 @@ if (cudax_ENABLE_HEADER_TESTING)
endif()

if (cudax_ENABLE_TESTING)
include(CTest)
enable_testing() # Must be in root directory
add_subdirectory(test)
endif()

if (cudax_ENABLE_EXAMPLES)
add_subdirectory(examples)
endif()
48 changes: 47 additions & 1 deletion cudax/cmake/cudaxHeaderTesting.cmake
@@ -12,7 +12,9 @@ function(cudax_add_header_test label definitions)
cudax_get_target_property(config_dialect ${cn_target} DIALECT)
cudax_get_target_property(config_prefix ${cn_target} PREFIX)

set(headertest_target ${config_prefix}.headers.${label})
###################
# Non-STF headers #
set(headertest_target ${config_prefix}.headers.${label}.no_stf)
cccl_generate_header_tests(${headertest_target} cudax/include
DIALECT ${config_dialect}
# The cudax header template removes the check for the `small` macro.
@@ -22,6 +24,9 @@
# The following internal headers are not required to compile independently:
"cuda/experimental/__async/prologue.cuh"
"cuda/experimental/__async/epilogue.cuh"
# STF headers are compiled separately:
"cuda/experimental/stf.cuh"
"cuda/experimental/__stf/*"
)
target_link_libraries(${headertest_target} PUBLIC ${cn_target})
target_compile_definitions(${headertest_target} PRIVATE
@@ -32,6 +37,47 @@

add_dependencies(cudax.all.headers ${headertest_target})
add_dependencies(${config_prefix}.all ${headertest_target})

###############
# STF headers #
set(headertest_target ${config_prefix}.headers.${label}.stf)
cccl_generate_header_tests(${headertest_target} cudax/include
DIALECT ${config_dialect}
GLOBS
"cuda/experimental/stf.cuh"
"cuda/experimental/__stf/*.cuh"

# FIXME: The cudax header template removes the check for the `small` macro.
# cuda/experimental/__stf/utility/memory.cuh defines functions named `small`.
# These should be renamed to avoid conflicts with windows system headers, and
# the following line removed:
HEADER_TEMPLATE "${cudax_SOURCE_DIR}/cmake/header_test.in.cu"

EXCLUDES
# FIXME: layout_left::mapping referenced before layout_left:
cuda/experimental/__stf/supplemental_std_experimental/__p0009_bits/layout_left.hpp

# # FIXME: error: possibly dangling reference to a temporary (stream_task.cuh:114)
# cuda/experimental/__stf/stream/stream_task.cuh
# cuda/experimental/__stf/stream/stream_ctx.cuh
)
target_link_libraries(${headertest_target} PUBLIC ${cn_target})
target_compile_definitions(${headertest_target} PRIVATE
${definitions}
"-DCUDASTF_PROVIDE_MDSPAN"
)
target_compile_options(${headertest_target} PRIVATE
# Required by stf headers:
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>
# FIXME: We should be able to refactor away from needing this by
# using _CCCL_HOST_DEVICE and friends + `::cuda::std` utilities where
# necessary.
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-relaxed-constexpr>
)
cudax_clone_target_properties(${headertest_target} ${cn_target})

add_dependencies(cudax.all.headers ${headertest_target})
add_dependencies(${config_prefix}.all ${headertest_target})
endforeach()
endfunction()

44 changes: 44 additions & 0 deletions cudax/cmake/cudaxSTFConfigureTarget.cmake
@@ -0,0 +1,44 @@
# Configures a target for the STF framework.
function(cudax_stf_configure_target target_name)
set(options LINK_MATHLIBS)
set(oneValueArgs)
set(multiValueArgs)
cmake_parse_arguments(CSCT "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

target_link_libraries(${target_name} PRIVATE
${cn_target}
CUDA::cudart
CUDA::curand
CUDA::cuda_driver
)
target_compile_options(${target_name} PRIVATE
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-relaxed-constexpr>
)
target_compile_definitions(${target_name} PRIVATE
"CUDASTF_PROVIDE_MDSPAN"
)
set_target_properties(${target_name} PROPERTIES
CUDA_RUNTIME_LIBRARY Static
CUDA_SEPARABLE_COMPILATION ON
)

if (CSCT_LINK_MATHLIBS)
target_link_libraries(${target_name} PRIVATE
CUDA::cublas
CUDA::cusolver
)
endif()

if (cudax_ENABLE_CUDASTF_BOUNDSCHECK)
target_compile_definitions(${target_name} PRIVATE
"CUDASTF_BOUNDSCHECK"
)
endif()

if (cudax_ENABLE_CUDASTF_DEBUG)
target_compile_definitions(${target_name} PRIVATE
"CUDASTF_DEBUG"
)
endif()
endfunction()
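A usage sketch of the helper above: the target name and source file here are hypothetical, not taken from the PR. Note that the function body links `${cn_target}`, a variable it inherits from the caller's scope rather than from its arguments, so it must be invoked inside the per-configuration loop that defines `cn_target`.

```cmake
# Hypothetical STF example target; assumes it is created inside the
# per-configuration loop where `cn_target` is defined.
add_executable(my_stf_example my_example.cu)

# Applies the common STF settings: CUDA extended-lambda/relaxed-constexpr
# flags, cudart/curand/driver links, CUDASTF_PROVIDE_MDSPAN, and separable
# compilation. LINK_MATHLIBS additionally links cublas and cusolver.
cudax_stf_configure_target(my_stf_example LINK_MATHLIBS)
```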
9 changes: 9 additions & 0 deletions cudax/cmake/stf_header_unittest.in.cu
@@ -0,0 +1,9 @@
// This file is autogenerated by configuring stf_header_unittest.in.cu.

// clang-format off
#define UNITTESTED_FILE "@source@"

#include <cuda/experimental/__stf/utility/unittest.cuh>

#include <@source@>
// clang-format on
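The `@source@` token in the template suggests each unittested STF header gets its own translation unit generated via `configure_file`. A minimal sketch of the assumed wiring — the variable name, header path, and output location are illustrative only:

```cmake
# Assumed generation step: substitute @source@ in the template to produce
# one unittest TU per STF header. Paths below are illustrative.
set(source "cuda/experimental/__stf/utility/memory.cuh")
configure_file(
  "${cudax_SOURCE_DIR}/cmake/stf_header_unittest.in.cu"
  "${CMAKE_CURRENT_BINARY_DIR}/stf_unittests/${source}.cu"
  @ONLY)  # only expand @var@ references, leave ${var} untouched
```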
10 changes: 10 additions & 0 deletions cudax/examples/CMakeLists.txt
@@ -0,0 +1,10 @@
foreach(cn_target IN LISTS cudax_TARGETS)
cudax_get_target_property(config_prefix ${cn_target} PREFIX)

# Metatarget for the current configuration's tests:
set(config_meta_target ${config_prefix}.examples)
add_custom_target(${config_meta_target})
add_dependencies(${config_prefix}.all ${config_meta_target})
endforeach()

add_subdirectory(stf)
66 changes: 66 additions & 0 deletions cudax/examples/stf/01-axpy-cuda_kernel.cu
@@ -0,0 +1,66 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
* @file
*
* @brief An AXPY kernel described using a cuda_kernel construct
*
*/

#include <cuda/experimental/stf.cuh>

#include <cassert>
#include <cmath>

using namespace cuda::experimental::stf;

__global__ void axpy(double a, slice<const double> x, slice<double> y) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;

for (int i = tid; i < x.size(); i += nthreads) {
y(i) += a * x(i);
}
}

double X0(int i) {
return sin((double) i);
}

double Y0(int i) {
return cos((double) i);
}

int main() {
context ctx = graph_ctx();
const size_t N = 16;
double X[N], Y[N];

for (size_t i = 0; i < N; i++) {
X[i] = X0(i);
Y[i] = Y0(i);
}

double alpha = 3.14;

auto lX = ctx.logical_data(X);
auto lY = ctx.logical_data(Y);

/* Compute Y = Y + alpha X */
ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) {
// axpy<<<16, 128, 0, ...>>>(alpha, dX, dY)
return cuda_kernel_desc { axpy, 16, 128, 0, alpha, dX, dY };
};

ctx.finalize();

for (size_t i = 0; i < N; i++) {
assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001);
assert(fabs(X[i] - X0(i)) < 0.0001);
}
}
73 changes: 73 additions & 0 deletions cudax/examples/stf/01-axpy-cuda_kernel_chain.cu
@@ -0,0 +1,73 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
* @file
*
* @brief Example of task implementing a chain of CUDA kernels
*
*/

#include <cuda/experimental/stf.cuh>

#include <cassert>
#include <cmath>

using namespace cuda::experimental::stf;

__global__ void axpy(double a, slice<const double> x, slice<double> y) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;

for (int i = tid; i < x.size(); i += nthreads) {
y(i) += a * x(i);
}
}

double X0(int i) {
return sin((double) i);
}

double Y0(int i) {
return cos((double) i);
}

int main() {
context ctx = graph_ctx();
const size_t N = 16;
double X[N], Y[N];

for (size_t i = 0; i < N; i++) {
X[i] = X0(i);
Y[i] = Y0(i);
}

double alpha = 3.14;
double beta = 4.5;
double gamma = -4.1;

auto lX = ctx.logical_data(X);
auto lY = ctx.logical_data(Y);

/* Compute Y = Y + alpha X, Y = Y + beta X and then Y = Y + gamma X */
ctx.cuda_kernel_chain(lX.read(), lY.rw())->*[&](auto dX, auto dY) {
// clang-format off
return std::vector<cuda_kernel_desc> {
{ axpy, 16, 128, 0, alpha, dX, dY },
{ axpy, 16, 128, 0, beta, dX, dY },
{ axpy, 16, 128, 0, gamma, dX, dY }
};
// clang-format on
};

ctx.finalize();

for (size_t i = 0; i < N; i++) {
assert(fabs(Y[i] - (Y0(i) + (alpha + beta + gamma) * X0(i))) < 0.0001);
assert(fabs(X[i] - X0(i)) < 0.0001);
}
}