Skip to content

Commit

Permalink
Merge branch 'main' into __grid_constant__
Browse files Browse the repository at this point in the history
  • Loading branch information
fbusato authored Oct 10, 2024
2 parents fe0df81 + 87ef1d5 commit e9f1f64
Show file tree
Hide file tree
Showing 93 changed files with 5,544 additions and 2,724 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ if (CCCL_TOPLEVEL_PROJECT)
include(cmake/CCCLBuildCompilerTargets.cmake)
include(cmake/CCCLClangdCompileInfo.cmake)
include(cmake/CCCLConfigureTarget.cmake)
include(cmake/CCCLGenerateHeaderTests.cmake)
include(cmake/CCCLGetDependencies.cmake)

cccl_build_compiler_targets()
Expand Down
20 changes: 20 additions & 0 deletions CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"CCCL_ENABLE_CUDAX": false,
"CCCL_ENABLE_TESTING": false,
"CCCL_ENABLE_EXAMPLES": false,
"CCCL_ENABLE_C": false,
"libcudacxx_ENABLE_INSTALL_RULES": true,
"CUB_ENABLE_INSTALL_RULES": true,
"Thrust_ENABLE_INSTALL_RULES": true,
Expand Down Expand Up @@ -314,6 +315,16 @@
"cudax_ENABLE_DIALECT_CPP20": true
}
},
{
"name": "cccl-c-parallel",
"displayName": "CCCL C Parallel Library",
"inherits": "base",
"cacheVariables": {
"CCCL_ENABLE_C": true,
"CCCL_C_Parallel_ENABLE_TESTING": true,
"CCCL_C_Parallel_ENABLE_HEADER_TESTING": true
}
},
{
"name": "cccl-infra",
"displayName": "CCCL Infrastructure",
Expand Down Expand Up @@ -443,6 +454,10 @@
"name": "cudax-cpp20",
"configurePreset": "cudax-cpp20"
},
{
"name": "cccl-c-parallel",
"configurePreset": "cccl-c-parallel"
},
{
"name": "cccl-infra",
"configurePreset": "cccl-infra"
Expand Down Expand Up @@ -808,6 +823,11 @@
"configurePreset": "cudax-cpp20",
"inherits": "cudax-base"
},
{
"name": "cccl-c-parallel",
"configurePreset": "cccl-c-parallel",
"inherits": "base"
},
{
"name": "cccl-infra",
"configurePreset": "cccl-infra",
Expand Down
21 changes: 1 addition & 20 deletions c/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,20 +1 @@
cmake_minimum_required(VERSION 3.30)

project(cccl.c LANGUAGES CUDA CXX)

add_library(cccl.c SHARED src/reduce.cu)
set_property(TARGET cccl.c PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cccl.c PROPERTY CXX_STANDARD 20)
set_property(TARGET cccl.c PROPERTY CUDA_STANDARD 20)

find_package(CUDAToolkit REQUIRED)

# TODO Use static versions of cudart, nvrtc, and nvJitLink
target_link_libraries(cccl.c PRIVATE CUDA::cudart
CUDA::nvrtc
CUDA::nvJitLink
CUDA::cuda_driver)
target_compile_definitions(cccl.c PRIVATE NVRTC_GET_TYPE_NAME=1 CCCL_C_EXPERIMENTAL=1)
target_include_directories(cccl.c PUBLIC "include")

add_subdirectory(test)
add_subdirectory(parallel)
53 changes: 53 additions & 0 deletions c/parallel/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
cmake_minimum_required(VERSION 3.21)

project(CCCL_C_Parallel LANGUAGES CUDA CXX)

option(CCCL_C_Parallel_ENABLE_TESTING "Build CUDA Experimental's tests." OFF)
option(CCCL_C_Parallel_ENABLE_HEADER_TESTING "Build CUDA Experimental's standalone headers." OFF)

# FIXME Ideally this would be handled by presets and install rules, but for now
# consumers may override this to control the target location of cccl.c.parallel.
set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY "" CACHE PATH "Override output directory for the cccl.c.parallel library")
mark_as_advanced(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY)

file(GLOB_RECURSE srcs
RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
CONFIGURE_DEPENDS
"src/*.cu" "src/*.cpp"
)

add_library(cccl.c.parallel SHARED ${srcs})
set_property(TARGET cccl.c.parallel PROPERTY POSITION_INDEPENDENT_CODE ON)
cccl_configure_target(cccl.c.parallel DIALECT 20)

# Override the properties set by cccl_configure_target:
if (CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY)
set_target_properties(cccl.c.parallel PROPERTIES
LIBRARY_OUTPUT_DIRECTORY "${CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY}"
ARCHIVE_OUTPUT_DIRECTORY "${CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY}"
)
endif()

find_package(CUDAToolkit REQUIRED)

# TODO Use static versions of cudart, nvrtc, and nvJitLink
target_link_libraries(cccl.c.parallel PRIVATE
CUDA::cudart
CUDA::nvrtc
CUDA::nvJitLink
CUDA::cuda_driver
cccl.compiler_interface_cpp20
)
target_compile_definitions(cccl.c.parallel PUBLIC CCCL_C_EXPERIMENTAL=1)
target_compile_definitions(cccl.c.parallel PRIVATE NVRTC_GET_TYPE_NAME=1)

target_include_directories(cccl.c.parallel PUBLIC "include")
target_include_directories(cccl.c.parallel PRIVATE "src")

if (CCCL_C_Parallel_ENABLE_TESTING)
add_subdirectory(test)
endif()

if (CCCL_C_Parallel_ENABLE_HEADER_TESTING)
include(cmake/CParallelHeaderTesting.cmake)
endif()
11 changes: 11 additions & 0 deletions c/parallel/cmake/CParallelHeaderTesting.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# For every public header, build a translation unit containing `#include <header>`
# to let the compiler try to figure out warnings in that header if it is not otherwise
# included in tests, and also to verify if the headers are modular enough.
# .inl files are not globbed for, because they are not supposed to be used as public
# entrypoints.
# NOTE(review): relies on cccl_generate_header_tests() from
# cmake/CCCLGenerateHeaderTests.cmake being included by the top-level project.

cccl_generate_header_tests(cccl.c.parallel.headers c/parallel/include
  DIALECT 20
  GLOBS "cccl/c/*.h"
)
# The header TUs must see the library's usage requirements (include dirs and
# the PUBLIC CCCL_C_EXPERIMENTAL define) to compile.
target_link_libraries(cccl.c.parallel.headers PUBLIC cccl.c.parallel)
50 changes: 50 additions & 0 deletions c/parallel/include/cccl/c/for.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

// C-callable entry points for the device for-each algorithm.
// NOTE(review): despite exposing a C ABI, this header uses `extern "C"` and
// `noexcept` unconditionally, so it currently requires a C++ compiler to
// include — confirm whether a plain-C guard (`#ifdef __cplusplus`) is intended.

#pragma once

#ifndef CCCL_C_EXPERIMENTAL
#  warning "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this warning."
#else // ^^^ !CCCL_C_EXPERIMENTAL ^^^ / vvv CCCL_C_EXPERIMENTAL vvv

#  include <cuda.h>

#  include <cccl/c/types.h>

// Artifacts of a per-architecture JIT build of the for-each kernel.
// Produced by cccl_device_for_build; passed to cccl_device_for; released via
// cccl_device_for_cleanup.
struct cccl_device_for_build_result_t
{
  // Compute capability the kernel was built for (cc_major * 10 + cc_minor).
  int cc;
  // Compiled cubin image; owned by this struct until cleanup is called.
  void* cubin;
  // Size of the cubin image in bytes.
  size_t cubin_size;
  // Loaded library containing the kernel.
  CUlibrary library;
  // Entry point of the for-each kernel within `library`.
  CUkernel static_kernel;
};

// JIT-compiles the for-each kernel for the given data/op and target compute
// capability. The *_path arguments supply the include-path flags for the
// NVRTC compilation. On success, fills *build with owned artifacts.
extern "C" CCCL_C_API CUresult cccl_device_for_build(
  cccl_device_for_build_result_t* build,
  cccl_iterator_t d_data,
  cccl_op_t op,
  int cc_major,
  int cc_minor,
  const char* cub_path,
  const char* thrust_path,
  const char* libcudacxx_path,
  const char* ctk_path) noexcept;

// Launches the previously built for-each kernel over num_items elements of
// d_data on the given stream.
extern "C" CCCL_C_API CUresult cccl_device_for(
  cccl_device_for_build_result_t build,
  cccl_iterator_t d_data,
  int64_t num_items,
  cccl_op_t op,
  CUstream stream) noexcept;

// Releases the cubin buffer and unloads the library held by *bld_ptr.
// NOTE(review): unlike the other entry points this is not declared noexcept —
// confirm whether that is intentional (the definition also omits it).
extern "C" CCCL_C_API CUresult cccl_device_for_cleanup(cccl_device_for_build_result_t* bld_ptr);

#endif // CCCL_C_EXPERIMENTAL
File renamed without changes.
File renamed without changes.
190 changes: 190 additions & 0 deletions c/parallel/src/for.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <cub/detail/choose_offset.cuh>
#include <cub/grid/grid_even_share.cuh>
#include <cub/util_device.cuh>

#include <format>
#include <type_traits>

#include <cccl/c/for.h>
#include <cccl/c/types.h>
#include <for/for_op_helper.h>
#include <nvrtc/command_list.h>
#include <util/context.h>
#include <util/errors.h>
#include <util/types.h>

// NOTE(review): these two forward declarations have no visible uses in this
// translation unit — possibly carried over from reduce.cu; verify and remove.
struct op_wrapper;
struct device_reduce_policy;

// Offset type used for item counts in the generated kernel name/signature.
using OffsetT = unsigned long long;
// choose_offset_t must map OffsetT to itself; otherwise the kernel name built
// from nvrtcGetTypeName<OffsetT> would not match the instantiated template.
// (Fixed: the message previously claimed "OffsetT must be size_t", but OffsetT
// is unsigned long long and the condition checks choose_offset_t identity.)
static_assert(std::is_same_v<cub::detail::choose_offset_t<OffsetT>, OffsetT>,
              "choose_offset_t must not alter OffsetT (unsigned long long)");

// Launches the pre-built for-each kernel over [0, num_items).
// Returns cudaSuccess for an empty range; otherwise launches via the driver
// API (check() throws on driver error) and returns cudaPeekAtLastError().
static cudaError_t
Invoke(cccl_iterator_t d_in, size_t num_items, cccl_op_t op, int /*cc*/, CUfunction static_kernel, CUstream stream)
{
  cudaError error = cudaSuccess;

  // Nothing to launch for an empty range.
  if (num_items == 0)
  {
    return error;
  }

  // Bundle the user operator and iterator into the state blob the kernel reads.
  auto for_kernel_state = make_for_kernel_state(op, d_in);

  // Kernel arguments: the item count and a pointer to the kernel state.
  void* args[] = {&num_items, for_kernel_state.get()};

  int thread_count = 256;
  // NOTE(review): 256 threads per block while dividing num_items by 512 —
  // presumably the kernel processes 2 items per thread; confirm against the
  // kernel's tuning policy. Also, block_count is an int, so num_items beyond
  // INT_MAX * 512 would overflow — TODO confirm intended limits.
  int block_count = (num_items + 511) / 512;
  check(cuLaunchKernel(static_kernel, block_count, 1, 1, thread_count, 1, 1, 0, stream, args, 0));

  // Check for failure to launch
  error = CubDebug(cudaPeekAtLastError());

  return error;
}

struct for_each_wrapper;

// Produces the fully-qualified name of the static for-each kernel template
// instantiation, using NVRTC to recover the spelled-out type names.
static std::string get_device_for_kernel_name()
{
  std::string wrapper_name;
  check(nvrtcGetTypeName<for_each_wrapper>(&wrapper_name));

  std::string offset_name;
  check(nvrtcGetTypeName<OffsetT>(&offset_name));

  return std::format("cub::detail::for_each::static_kernel<device_for_policy, {0}, {1}>", offset_name, wrapper_name);
}

// JIT-compiles the for-each kernel for the given iterator/operator and target
// compute capability. The *_path arguments are forwarded verbatim as NVRTC
// include-path flags. On success fills *build; any failure (including NVRTC
// or driver errors surfaced as exceptions by check()) yields
// CUDA_ERROR_UNKNOWN and leaves *build unspecified.
extern "C" CCCL_C_API CUresult cccl_device_for_build(
  cccl_device_for_build_result_t* build,
  cccl_iterator_t d_data,
  cccl_op_t op,
  int cc_major,
  int cc_minor,
  const char* cub_path,
  const char* thrust_path,
  const char* libcudacxx_path,
  const char* ctk_path) noexcept
{
  CUresult error = CUDA_SUCCESS;

  try
  {
    if (d_data.type == cccl_iterator_kind_t::iterator)
    {
      throw std::runtime_error(std::string("Iterators are unsupported in for_each currently"));
    }

    // Placeholder translation-unit name for NVRTC diagnostics.
    const char* name = "test";

    const int cc                    = cc_major * 10 + cc_minor;
    const std::string d_data_value_t = cccl_type_enum_to_string(d_data.value_type.type);
    const std::string offset_t       = cccl_type_enum_to_string(cccl_type_enum::UINT64);

    const std::string for_kernel_name   = get_device_for_kernel_name();
    const std::string device_for_kernel = get_for_kernel(op, d_data);

    const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor);

    constexpr size_t num_args = 7;
    const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};

    constexpr size_t num_lto_args = 2;
    const char* lopts[num_lto_args] = {"-lto", arch.c_str()};

    std::string lowered_name;

    auto cl =
      make_nvrtc_command_list()
        .add_program(nvrtc_translation_unit{device_for_kernel, name})
        .add_expression({for_kernel_name})
        .compile_program({args, num_args})
        .get_name({for_kernel_name, lowered_name})
        .cleanup_program()
        .add_link({op.ltoir, op.ltoir_size});

    nvrtc_cubin result{};

    // NOTE(review): this branch is currently dead — the iterator kind throws
    // above. Kept for when iterator support lands.
    if (cccl_iterator_kind_t::iterator == d_data.type)
    {
      result = cl.add_link({d_data.advance.ltoir, d_data.advance.ltoir_size})
                 .add_link({d_data.dereference.ltoir, d_data.dereference.ltoir_size})
                 .finalize_program(num_lto_args, lopts);
    }
    else
    {
      result = cl.finalize_program(num_lto_args, lopts);
    }

    // Bug fix: cuLibraryLoadData's result was previously ignored, unlike every
    // other driver call here; a failed load would fall through to
    // cuLibraryGetKernel with an invalid library handle.
    check(cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));
    check(cuLibraryGetKernel(&build->static_kernel, build->library, lowered_name.c_str()));

    build->cc         = cc;
    build->cubin      = (void*) result.cubin.release(); // ownership transfers to *build
    build->cubin_size = result.size;
  }
  catch (...)
  {
    error = CUDA_ERROR_UNKNOWN;
  }
  return error;
}

// Launches the previously built for-each kernel over num_items elements of
// d_data on the given stream. Pushes the primary context if none is current
// (try_push_context) and pops it again before returning.
extern "C" CCCL_C_API CUresult cccl_device_for(
  cccl_device_for_build_result_t build,
  cccl_iterator_t d_data,
  int64_t num_items,
  cccl_op_t op,
  CUstream stream) noexcept
{
  bool pushed    = false;
  CUresult error = CUDA_SUCCESS;

  try
  {
    pushed = try_push_context();
    // Bug fix: Invoke's cudaError_t result was previously discarded, so a
    // failed launch (cudaPeekAtLastError) was reported as CUDA_SUCCESS.
    if (Invoke(d_data, num_items, op, build.cc, (CUfunction) build.static_kernel, stream) != cudaSuccess)
    {
      error = CUDA_ERROR_UNKNOWN;
    }
  }
  catch (...)
  {
    error = CUDA_ERROR_UNKNOWN;
  }

  // Balance the context push even on failure paths.
  if (pushed)
  {
    CUcontext dummy;
    cuCtxPopCurrent(&dummy);
  }

  return error;
}

// Releases the build artifacts: frees the cubin buffer and unloads the
// library. Returns CUDA_ERROR_INVALID_VALUE for a null pointer and
// CUDA_ERROR_UNKNOWN if unloading throws.
extern "C" CCCL_C_API CUresult cccl_device_for_cleanup(cccl_device_for_build_result_t* bld_ptr)
{
  // A null-pointer check cannot throw, so it lives outside the try block.
  if (bld_ptr == nullptr)
  {
    return CUDA_ERROR_INVALID_VALUE;
  }

  try
  {
    // Reclaim ownership of the cubin allocation so it is freed on scope exit,
    // then unload the driver library.
    std::unique_ptr<char[]> owned_cubin{static_cast<char*>(bld_ptr->cubin)};
    check(cuLibraryUnload(bld_ptr->library));
  }
  catch (...)
  {
    return CUDA_ERROR_UNKNOWN;
  }

  return CUDA_SUCCESS;
}
Loading

0 comments on commit e9f1f64

Please sign in to comment.