Skip to content

Commit

Permalink
Merge branch 'main' into __grid_constant__
Browse files Browse the repository at this point in the history
  • Loading branch information
fbusato authored Oct 10, 2024
2 parents fe0df81 + 87ef1d5 commit e9f1f64
Show file tree
Hide file tree
Showing 93 changed files with 5,544 additions and 2,724 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ if (CCCL_TOPLEVEL_PROJECT)
include(cmake/CCCLBuildCompilerTargets.cmake)
include(cmake/CCCLClangdCompileInfo.cmake)
include(cmake/CCCLConfigureTarget.cmake)
include(cmake/CCCLGenerateHeaderTests.cmake)
include(cmake/CCCLGetDependencies.cmake)

cccl_build_compiler_targets()
Expand Down
20 changes: 20 additions & 0 deletions CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"CCCL_ENABLE_CUDAX": false,
"CCCL_ENABLE_TESTING": false,
"CCCL_ENABLE_EXAMPLES": false,
"CCCL_ENABLE_C": false,
"libcudacxx_ENABLE_INSTALL_RULES": true,
"CUB_ENABLE_INSTALL_RULES": true,
"Thrust_ENABLE_INSTALL_RULES": true,
Expand Down Expand Up @@ -314,6 +315,16 @@
"cudax_ENABLE_DIALECT_CPP20": true
}
},
{
"name": "cccl-c-parallel",
"displayName": "CCCL C Parallel Library",
"inherits": "base",
"cacheVariables": {
"CCCL_ENABLE_C": true,
"CCCL_C_Parallel_ENABLE_TESTING": true,
"CCCL_C_Parallel_ENABLE_HEADER_TESTING": true
}
},
{
"name": "cccl-infra",
"displayName": "CCCL Infrastructure",
Expand Down Expand Up @@ -443,6 +454,10 @@
"name": "cudax-cpp20",
"configurePreset": "cudax-cpp20"
},
{
"name": "cccl-c-parallel",
"configurePreset": "cccl-c-parallel"
},
{
"name": "cccl-infra",
"configurePreset": "cccl-infra"
Expand Down Expand Up @@ -808,6 +823,11 @@
"configurePreset": "cudax-cpp20",
"inherits": "cudax-base"
},
{
"name": "cccl-c-parallel",
"configurePreset": "cccl-c-parallel",
"inherits": "base"
},
{
"name": "cccl-infra",
"configurePreset": "cccl-infra",
Expand Down
21 changes: 1 addition & 20 deletions c/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,20 +1 @@
cmake_minimum_required(VERSION 3.30)

project(cccl.c LANGUAGES CUDA CXX)

add_library(cccl.c SHARED src/reduce.cu)
set_property(TARGET cccl.c PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cccl.c PROPERTY CXX_STANDARD 20)
set_property(TARGET cccl.c PROPERTY CUDA_STANDARD 20)

find_package(CUDAToolkit REQUIRED)

# TODO Use static versions of cudart, nvrtc, and nvJitLink
target_link_libraries(cccl.c PRIVATE CUDA::cudart
CUDA::nvrtc
CUDA::nvJitLink
CUDA::cuda_driver)
target_compile_definitions(cccl.c PRIVATE NVRTC_GET_TYPE_NAME=1 CCCL_C_EXPERIMENTAL=1)
target_include_directories(cccl.c PUBLIC "include")

add_subdirectory(test)
add_subdirectory(parallel)
53 changes: 53 additions & 0 deletions c/parallel/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
cmake_minimum_required(VERSION 3.21)

project(CCCL_C_Parallel LANGUAGES CUDA CXX)

option(CCCL_C_Parallel_ENABLE_TESTING "Build CUDA Experimental's tests." OFF)
option(CCCL_C_Parallel_ENABLE_HEADER_TESTING "Build CUDA Experimental's standalone headers." OFF)

# FIXME Ideally this would be handled by presets and install rules, but for now
# consumers may override this to control the target location of cccl.c.parallel.
set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY "" CACHE PATH "Override output directory for the cccl.c.parallel library")
mark_as_advanced(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY)

file(GLOB_RECURSE srcs
RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
CONFIGURE_DEPENDS
"src/*.cu" "src/*.cpp"
)

add_library(cccl.c.parallel SHARED ${srcs})
set_property(TARGET cccl.c.parallel PROPERTY POSITION_INDEPENDENT_CODE ON)
cccl_configure_target(cccl.c.parallel DIALECT 20)

# Override the properties set by cccl_configure_target:
if (CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY)
set_target_properties(cccl.c.parallel PROPERTIES
LIBRARY_OUTPUT_DIRECTORY "${CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY}"
ARCHIVE_OUTPUT_DIRECTORY "${CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY}"
)
endif()

find_package(CUDAToolkit REQUIRED)

# TODO Use static versions of cudart, nvrtc, and nvJitLink
target_link_libraries(cccl.c.parallel PRIVATE
CUDA::cudart
CUDA::nvrtc
CUDA::nvJitLink
CUDA::cuda_driver
cccl.compiler_interface_cpp20
)
target_compile_definitions(cccl.c.parallel PUBLIC CCCL_C_EXPERIMENTAL=1)
target_compile_definitions(cccl.c.parallel PRIVATE NVRTC_GET_TYPE_NAME=1)

target_include_directories(cccl.c.parallel PUBLIC "include")
target_include_directories(cccl.c.parallel PRIVATE "src")

if (CCCL_C_Parallel_ENABLE_TESTING)
add_subdirectory(test)
endif()

if (CCCL_C_Parallel_ENABLE_HEADER_TESTING)
include(cmake/CParallelHeaderTesting.cmake)
endif()
11 changes: 11 additions & 0 deletions c/parallel/cmake/CParallelHeaderTesting.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# For every public header, build a translation unit containing `#include <header>`
# to let the compiler try to figure out warnings in that header if it is not otherwise
# included in tests, and also to verify if the headers are modular enough.
# .inl files are not globbed for, because they are not supposed to be used as public
# entrypoints.
# NOTE(review): relies on cccl_generate_header_tests() from
# cmake/CCCLGenerateHeaderTests.cmake being included by the top-level project.

cccl_generate_header_tests(cccl.c.parallel.headers c/parallel/include
  DIALECT 20
  GLOBS "cccl/c/*.h"
)
# The header TUs must see the library's usage requirements (include dirs and
# the PUBLIC CCCL_C_EXPERIMENTAL define) to compile.
target_link_libraries(cccl.c.parallel.headers PUBLIC cccl.c.parallel)
50 changes: 50 additions & 0 deletions c/parallel/include/cccl/c/for.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

// C-callable entry points for the device for-each algorithm.
// NOTE(review): despite exposing a C ABI, this header uses `extern "C"` and
// `noexcept` unconditionally, so it currently requires a C++ compiler to
// include — confirm whether a plain-C guard (`#ifdef __cplusplus`) is intended.

#pragma once

#ifndef CCCL_C_EXPERIMENTAL
#  warning "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this warning."
#else // ^^^ !CCCL_C_EXPERIMENTAL ^^^ / vvv CCCL_C_EXPERIMENTAL vvv

#  include <cuda.h>

#  include <cccl/c/types.h>

// Artifacts of a per-architecture JIT build of the for-each kernel.
// Produced by cccl_device_for_build; passed to cccl_device_for; released via
// cccl_device_for_cleanup.
struct cccl_device_for_build_result_t
{
  // Compute capability the kernel was built for (cc_major * 10 + cc_minor).
  int cc;
  // Compiled cubin image; owned by this struct until cleanup is called.
  void* cubin;
  // Size of the cubin image in bytes.
  size_t cubin_size;
  // Loaded library containing the kernel.
  CUlibrary library;
  // Entry point of the for-each kernel within `library`.
  CUkernel static_kernel;
};

// JIT-compiles the for-each kernel for the given data/op and target compute
// capability. The *_path arguments supply the include-path flags for the
// NVRTC compilation. On success, fills *build with owned artifacts.
extern "C" CCCL_C_API CUresult cccl_device_for_build(
  cccl_device_for_build_result_t* build,
  cccl_iterator_t d_data,
  cccl_op_t op,
  int cc_major,
  int cc_minor,
  const char* cub_path,
  const char* thrust_path,
  const char* libcudacxx_path,
  const char* ctk_path) noexcept;

// Launches the previously built for-each kernel over num_items elements of
// d_data on the given stream.
extern "C" CCCL_C_API CUresult cccl_device_for(
  cccl_device_for_build_result_t build,
  cccl_iterator_t d_data,
  int64_t num_items,
  cccl_op_t op,
  CUstream stream) noexcept;

// Releases the cubin buffer and unloads the library held by *bld_ptr.
// NOTE(review): unlike the other entry points this is not declared noexcept —
// confirm whether that is intentional (the definition also omits it).
extern "C" CCCL_C_API CUresult cccl_device_for_cleanup(cccl_device_for_build_result_t* bld_ptr);

#endif // CCCL_C_EXPERIMENTAL
File renamed without changes.
File renamed without changes.
190 changes: 190 additions & 0 deletions c/parallel/src/for.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <cub/detail/choose_offset.cuh>
#include <cub/grid/grid_even_share.cuh>
#include <cub/util_device.cuh>

#include <format>
#include <type_traits>

#include <cccl/c/for.h>
#include <cccl/c/types.h>
#include <for/for_op_helper.h>
#include <nvrtc/command_list.h>
#include <util/context.h>
#include <util/errors.h>
#include <util/types.h>

// NOTE(review): these two forward declarations have no visible uses in this
// translation unit — possibly carried over from reduce.cu; verify and remove.
struct op_wrapper;
struct device_reduce_policy;

// Offset type used for item counts in the generated kernel name/signature.
using OffsetT = unsigned long long;
// choose_offset_t must map OffsetT to itself; otherwise the kernel name built
// from nvrtcGetTypeName<OffsetT> would not match the instantiated template.
// (Fixed: the message previously claimed "OffsetT must be size_t", but OffsetT
// is unsigned long long and the condition checks choose_offset_t identity.)
static_assert(std::is_same_v<cub::detail::choose_offset_t<OffsetT>, OffsetT>,
              "choose_offset_t must not alter OffsetT (unsigned long long)");

// Launches the pre-built for-each kernel over [0, num_items).
// Returns cudaSuccess for an empty range; otherwise launches via the driver
// API (check() throws on driver error) and returns cudaPeekAtLastError().
static cudaError_t
Invoke(cccl_iterator_t d_in, size_t num_items, cccl_op_t op, int /*cc*/, CUfunction static_kernel, CUstream stream)
{
  cudaError error = cudaSuccess;

  // Nothing to launch for an empty range.
  if (num_items == 0)
  {
    return error;
  }

  // Bundle the user operator and iterator into the state blob the kernel reads.
  auto for_kernel_state = make_for_kernel_state(op, d_in);

  // Kernel arguments: the item count and a pointer to the kernel state.
  void* args[] = {&num_items, for_kernel_state.get()};

  int thread_count = 256;
  // NOTE(review): 256 threads per block while dividing num_items by 512 —
  // presumably the kernel processes 2 items per thread; confirm against the
  // kernel's tuning policy. Also, block_count is an int, so num_items beyond
  // INT_MAX * 512 would overflow — TODO confirm intended limits.
  int block_count = (num_items + 511) / 512;
  check(cuLaunchKernel(static_kernel, block_count, 1, 1, thread_count, 1, 1, 0, stream, args, 0));

  // Check for failure to launch
  error = CubDebug(cudaPeekAtLastError());

  return error;
}

struct for_each_wrapper;

// Produces the fully-qualified name of the static for-each kernel template
// instantiation, using NVRTC to recover the spelled-out type names.
static std::string get_device_for_kernel_name()
{
  std::string wrapper_name;
  check(nvrtcGetTypeName<for_each_wrapper>(&wrapper_name));

  std::string offset_name;
  check(nvrtcGetTypeName<OffsetT>(&offset_name));

  return std::format("cub::detail::for_each::static_kernel<device_for_policy, {0}, {1}>", offset_name, wrapper_name);
}

// JIT-compiles the for-each kernel for the given iterator/operator and target
// compute capability. The *_path arguments are forwarded verbatim as NVRTC
// include-path flags. On success fills *build; any failure (including NVRTC
// or driver errors surfaced as exceptions by check()) yields
// CUDA_ERROR_UNKNOWN and leaves *build unspecified.
extern "C" CCCL_C_API CUresult cccl_device_for_build(
  cccl_device_for_build_result_t* build,
  cccl_iterator_t d_data,
  cccl_op_t op,
  int cc_major,
  int cc_minor,
  const char* cub_path,
  const char* thrust_path,
  const char* libcudacxx_path,
  const char* ctk_path) noexcept
{
  CUresult error = CUDA_SUCCESS;

  try
  {
    if (d_data.type == cccl_iterator_kind_t::iterator)
    {
      throw std::runtime_error(std::string("Iterators are unsupported in for_each currently"));
    }

    // Placeholder translation-unit name for NVRTC diagnostics.
    const char* name = "test";

    const int cc                    = cc_major * 10 + cc_minor;
    const std::string d_data_value_t = cccl_type_enum_to_string(d_data.value_type.type);
    const std::string offset_t       = cccl_type_enum_to_string(cccl_type_enum::UINT64);

    const std::string for_kernel_name   = get_device_for_kernel_name();
    const std::string device_for_kernel = get_for_kernel(op, d_data);

    const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor);

    constexpr size_t num_args = 7;
    const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};

    constexpr size_t num_lto_args = 2;
    const char* lopts[num_lto_args] = {"-lto", arch.c_str()};

    std::string lowered_name;

    auto cl =
      make_nvrtc_command_list()
        .add_program(nvrtc_translation_unit{device_for_kernel, name})
        .add_expression({for_kernel_name})
        .compile_program({args, num_args})
        .get_name({for_kernel_name, lowered_name})
        .cleanup_program()
        .add_link({op.ltoir, op.ltoir_size});

    nvrtc_cubin result{};

    // NOTE(review): this branch is currently dead — the iterator kind throws
    // above. Kept for when iterator support lands.
    if (cccl_iterator_kind_t::iterator == d_data.type)
    {
      result = cl.add_link({d_data.advance.ltoir, d_data.advance.ltoir_size})
                 .add_link({d_data.dereference.ltoir, d_data.dereference.ltoir_size})
                 .finalize_program(num_lto_args, lopts);
    }
    else
    {
      result = cl.finalize_program(num_lto_args, lopts);
    }

    // Bug fix: cuLibraryLoadData's result was previously ignored, unlike every
    // other driver call here; a failed load would fall through to
    // cuLibraryGetKernel with an invalid library handle.
    check(cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));
    check(cuLibraryGetKernel(&build->static_kernel, build->library, lowered_name.c_str()));

    build->cc         = cc;
    build->cubin      = (void*) result.cubin.release(); // ownership transfers to *build
    build->cubin_size = result.size;
  }
  catch (...)
  {
    error = CUDA_ERROR_UNKNOWN;
  }
  return error;
}

// Launches the previously built for-each kernel over num_items elements of
// d_data on the given stream. Pushes the primary context if none is current
// (try_push_context) and pops it again before returning.
extern "C" CCCL_C_API CUresult cccl_device_for(
  cccl_device_for_build_result_t build,
  cccl_iterator_t d_data,
  int64_t num_items,
  cccl_op_t op,
  CUstream stream) noexcept
{
  bool pushed    = false;
  CUresult error = CUDA_SUCCESS;

  try
  {
    pushed = try_push_context();
    // Bug fix: Invoke's cudaError_t result was previously discarded, so a
    // failed launch (cudaPeekAtLastError) was reported as CUDA_SUCCESS.
    if (Invoke(d_data, num_items, op, build.cc, (CUfunction) build.static_kernel, stream) != cudaSuccess)
    {
      error = CUDA_ERROR_UNKNOWN;
    }
  }
  catch (...)
  {
    error = CUDA_ERROR_UNKNOWN;
  }

  // Balance the context push even on failure paths.
  if (pushed)
  {
    CUcontext dummy;
    cuCtxPopCurrent(&dummy);
  }

  return error;
}

// Releases the build artifacts: frees the cubin buffer and unloads the
// library. Returns CUDA_ERROR_INVALID_VALUE for a null pointer and
// CUDA_ERROR_UNKNOWN if unloading throws.
extern "C" CCCL_C_API CUresult cccl_device_for_cleanup(cccl_device_for_build_result_t* bld_ptr)
{
  // A null-pointer check cannot throw, so it lives outside the try block.
  if (bld_ptr == nullptr)
  {
    return CUDA_ERROR_INVALID_VALUE;
  }

  try
  {
    // Reclaim ownership of the cubin allocation so it is freed on scope exit,
    // then unload the driver library.
    std::unique_ptr<char[]> owned_cubin{static_cast<char*>(bld_ptr->cubin)};
    check(cuLibraryUnload(bld_ptr->library));
  }
  catch (...)
  {
    return CUDA_ERROR_UNKNOWN;
  }

  return CUDA_SUCCESS;
}
Loading

0 comments on commit e9f1f64

Please sign in to comment.