Skip to content

Commit

Permalink
Integrate c/parallel with CCCL build system and CI. (#2514)
Browse files Browse the repository at this point in the history
Integrate c/parallel into CCCL, set up CI, etc.
  • Loading branch information
alliepiper authored Oct 9, 2024
1 parent 951c822 commit e149e86
Show file tree
Hide file tree
Showing 29 changed files with 177 additions and 64 deletions.
20 changes: 20 additions & 0 deletions CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"CCCL_ENABLE_CUDAX": false,
"CCCL_ENABLE_TESTING": false,
"CCCL_ENABLE_EXAMPLES": false,
"CCCL_ENABLE_C": false,
"libcudacxx_ENABLE_INSTALL_RULES": true,
"CUB_ENABLE_INSTALL_RULES": true,
"Thrust_ENABLE_INSTALL_RULES": true,
Expand Down Expand Up @@ -314,6 +315,16 @@
"cudax_ENABLE_DIALECT_CPP20": true
}
},
{
"name": "cccl-c-parallel",
"displayName" : "CCCL C Parallel Library",
"inherits": "base",
"cacheVariables": {
"CCCL_ENABLE_C": true,
"CCCL_C_Parallel_ENABLE_TESTING": true,
"CCCL_C_Parallel_ENABLE_HEADER_TESTING": true
}
},
{
"name": "cccl-infra",
"displayName": "CCCL Infrastructure",
Expand Down Expand Up @@ -443,6 +454,10 @@
"name": "cudax-cpp20",
"configurePreset": "cudax-cpp20"
},
{
"name": "cccl-c-parallel",
"configurePreset": "cccl-c-parallel"
},
{
"name": "cccl-infra",
"configurePreset": "cccl-infra"
Expand Down Expand Up @@ -808,6 +823,11 @@
"configurePreset": "cudax-cpp20",
"inherits": "cudax-base"
},
{
"name": "cccl-c-parallel",
"configurePreset": "cccl-c-parallel",
"inherits": "base"
},
{
"name": "cccl-infra",
"configurePreset": "cccl-infra",
Expand Down
26 changes: 1 addition & 25 deletions c/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,25 +1 @@
cmake_minimum_required(VERSION 3.30)

project(cccl.c LANGUAGES CUDA CXX)

add_library(cccl.c SHARED
src/reduce.cu src/for.cu
src/for/for_op_helper.cpp
src/util/errors.cpp src/util/types.cpp src/util/context.cpp)

set_property(TARGET cccl.c PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cccl.c PROPERTY CXX_STANDARD 20)
set_property(TARGET cccl.c PROPERTY CUDA_STANDARD 20)

find_package(CUDAToolkit REQUIRED)

# TODO Use static versions of cudart, nvrtc, and nvJitLink
target_link_libraries(cccl.c PRIVATE CUDA::cudart
CUDA::nvrtc
CUDA::nvJitLink
CUDA::cuda_driver)
target_compile_definitions(cccl.c PRIVATE NVRTC_GET_TYPE_NAME=1 CCCL_C_EXPERIMENTAL=1)
target_include_directories(cccl.c PUBLIC "include")
target_include_directories(cccl.c PRIVATE "src")

add_subdirectory(test)
add_subdirectory(parallel)
53 changes: 53 additions & 0 deletions c/parallel/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
cmake_minimum_required(VERSION 3.21)

project(CCCL_C_Parallel LANGUAGES CUDA CXX)

# Fixed option descriptions: they were copy-pasted from the cudax project and
# incorrectly referred to "CUDA Experimental".
option(CCCL_C_Parallel_ENABLE_TESTING "Build cccl.c.parallel's tests." OFF)
option(CCCL_C_Parallel_ENABLE_HEADER_TESTING "Test compilation of cccl.c.parallel's standalone public headers." OFF)

# FIXME Ideally this would be handled by presets and install rules, but for now
# consumers may override this to control the target location of cccl.c.parallel.
set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY "" CACHE PATH "Override output directory for the cccl.c.parallel library")
mark_as_advanced(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY)

# CONFIGURE_DEPENDS re-runs the glob at build time so newly added sources are
# picked up without a manual reconfigure.
file(GLOB_RECURSE srcs
  RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
  CONFIGURE_DEPENDS
  "src/*.cu" "src/*.cpp"
)

add_library(cccl.c.parallel SHARED ${srcs})
set_property(TARGET cccl.c.parallel PROPERTY POSITION_INDEPENDENT_CODE ON)
cccl_configure_target(cccl.c.parallel DIALECT 20)

# Override the properties set by cccl_configure_target:
if (CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY)
  set_target_properties(cccl.c.parallel PROPERTIES
    LIBRARY_OUTPUT_DIRECTORY "${CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY}"
    ARCHIVE_OUTPUT_DIRECTORY "${CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY}"
  )
endif()

find_package(CUDAToolkit REQUIRED)

# TODO Use static versions of cudart, nvrtc, and nvJitLink
target_link_libraries(cccl.c.parallel PRIVATE
  CUDA::cudart
  CUDA::nvrtc
  CUDA::nvJitLink
  CUDA::cuda_driver
  cccl.compiler_interface_cpp20
)
# CCCL_C_EXPERIMENTAL must be PUBLIC: the installed headers require it, so
# consumers of this target need the definition too. NVRTC_GET_TYPE_NAME is an
# implementation detail and stays PRIVATE.
target_compile_definitions(cccl.c.parallel PUBLIC CCCL_C_EXPERIMENTAL=1)
target_compile_definitions(cccl.c.parallel PRIVATE NVRTC_GET_TYPE_NAME=1)

target_include_directories(cccl.c.parallel PUBLIC "include")
target_include_directories(cccl.c.parallel PRIVATE "src")

if (CCCL_C_Parallel_ENABLE_TESTING)
  add_subdirectory(test)
endif()

if (CCCL_C_Parallel_ENABLE_HEADER_TESTING)
  include(cmake/CParallelHeaderTesting.cmake)
endif()
11 changes: 11 additions & 0 deletions c/parallel/cmake/CParallelHeaderTesting.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# For every public header, build a translation unit containing `#include <header>`
# to let the compiler try to figure out warnings in that header if it is not otherwise
# included in tests, and also to verify if the headers are modular enough.
# .inl files are not globbed for, because they are not supposed to be used as public
# entrypoints.

# NOTE(review): `cccl_generate_header_tests` is a project-provided helper
# (defined outside this file); it presumably creates the
# `cccl.c.parallel.headers` target from the globbed headers — confirm against
# its definition in the CCCL cmake/ infrastructure.
cccl_generate_header_tests(cccl.c.parallel.headers c/parallel/include
DIALECT 20
GLOBS "cccl/c/*.h"
)
# Link the header-test target against the library so the header TUs see the
# same usage requirements (include dirs, CCCL_C_EXPERIMENTAL) as consumers do.
target_link_libraries(cccl.c.parallel.headers PUBLIC cccl.c.parallel)
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion c/src/for.cu → c/parallel/src/for.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ using OffsetT = unsigned long long;
static_assert(std::is_same_v<cub::detail::choose_offset_t<OffsetT>, OffsetT>, "OffsetT must be size_t");

static cudaError_t
Invoke(cccl_iterator_t d_in, size_t num_items, cccl_op_t op, int cc, CUfunction static_kernel, CUstream stream)
Invoke(cccl_iterator_t d_in, size_t num_items, cccl_op_t op, int /*cc*/, CUfunction static_kernel, CUstream stream)
{
cudaError error = cudaSuccess;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,6 @@ for_each_kernel_state make_for_kernel_state(cccl_op_t op, cccl_iterator_t iterat
{
// Iterator is either a pointer or a stateful object, allocate space according to its size or alignment
size_t iter_size = (cccl_iterator_kind_t::iterator == iterator.type) ? iterator.size : sizeof(void*);
size_t iter_align = (cccl_iterator_kind_t::iterator == iterator.type) ? iterator.alignment : alignof(void*);
void* iterator_state = (cccl_iterator_kind_t::iterator == iterator.type) ? iterator.state : &iterator.state;

// Do we need to valid user input? Alignments larger than the provided size?
Expand Down
File renamed without changes.
6 changes: 3 additions & 3 deletions c/src/reduce.cu → c/parallel/src/reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ static reduce_tuning_t find_tuning(int cc, const reduce_tuning_t (&tunings)[N])
return tunings[N - 1];
}

static runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type, cccl_type_info input_type)
static runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type, cccl_type_info /*input_type*/)
{
reduce_tuning_t chain[] = {{60, 256, 16, 4}, {35, 256, 20, 4}};

Expand All @@ -77,7 +77,7 @@ static runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type,
return {block_size, items_per_thread, vector_load_length};
}

static cccl_type_info get_accumulator_type(cccl_op_t op, cccl_iterator_t input_it, cccl_value_t init)
static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*input_it*/, cccl_value_t init)
{
// TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet
// so switching back to the old accumulator type logic for now
Expand Down Expand Up @@ -254,7 +254,7 @@ static cudaError_t Invoke(
runtime_tuning_policy policy = get_policy(cc, accum_t, d_in.value_type);

// Force kernel code-generation in all compiler passes
if (num_items <= (policy.block_size * policy.items_per_thread))
if (num_items <= static_cast<OffsetT>(policy.block_size * policy.items_per_thread))
{
// Small, single tile size
return InvokeSingleTile(
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
40 changes: 40 additions & 0 deletions c/parallel/test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
cccl_get_catch2()

# cccl_c_parallel_add_test(<out_var> <source>)
#
# Adds an executable test target built from `source` (a file named
# test_<name>.<ext>) plus the shared test_main.cpp driver, links it against
# cccl.c.parallel and Catch2, registers it with CTest, and returns the
# generated target name in the caller's `<out_var>`.
function(cccl_c_parallel_add_test target_name_var source)
  string(REGEX REPLACE "test_([^.]*)" "cccl.c.parallel.test.\\1" target_name "${source}")
  # Fix: dereference the out-parameter so the *caller's* named variable is set.
  # Previously this always wrote a variable literally called `target_name_var`
  # in the parent scope, leaving the caller's variable untouched.
  set(${target_name_var} ${target_name} PARENT_SCOPE)

  add_executable(${target_name}
    "${source}"
    test_main.cpp
  )
  cccl_configure_target(${target_name} DIALECT 20)

  target_link_libraries(${target_name} PRIVATE
    cccl.c.parallel
    CUDA::cudart
    CUDA::nvrtc
    Catch2::Catch2
    cccl.compiler_interface_cpp20
  )

  # Include paths baked into the test binaries (handed to NVRTC at runtime).
  target_compile_definitions(${target_name} PRIVATE
    TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub"
    # Fix: point at the Thrust sources, not cub (copy/paste error).
    TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/thrust"
    TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include"
    TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}"
  )

  add_test(NAME ${target_name} COMMAND ${target_name})
endfunction()

# Pick up every test_*.cu / test_*.cpp in this directory. test_main.cpp is the
# shared Catch2 entry point, not a test in its own right, so it is excluded.
file(GLOB test_srcs
  RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
  CONFIGURE_DEPENDS
  *.cu *.cpp
)
list(REMOVE_ITEM test_srcs test_main.cpp)

foreach(test_src IN LISTS test_srcs)
  cccl_c_parallel_add_test(test_target "${test_src}")
endforeach()
File renamed without changes.
File renamed without changes.
File renamed without changes.
13 changes: 7 additions & 6 deletions c/test/test_reduce.cpp → c/parallel/test/test_reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ void reduce(cccl_iterator_t input, cccl_iterator_t output, unsigned long long nu
using integral_types = std::tuple<int32_t, uint32_t, int64_t, uint64_t>;
TEMPLATE_LIST_TEST_CASE("Reduce works with integral types", "[reduce]", integral_types)
{
const int num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24)));
const std::size_t num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24)));
operation_t op = make_operation("op", get_reduce_op(get_type_info<TestType>().type));
const std::vector<TestType> input = generate<TestType>(num_items);
pointer_t<TestType> input_ptr(input);
Expand All @@ -70,7 +70,7 @@ struct pair

TEST_CASE("Reduce works with custom types", "[reduce]")
{
const int num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24)));
const std::size_t num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24)));

operation_t op = make_operation(
"op",
Expand Down Expand Up @@ -204,8 +204,9 @@ TEST_CASE("Reduce works with input and output iterators", "[reduce]")

TEST_CASE("Reduce accumulator type is influenced by initial value", "[reduce]")
{
const int num_items = 1 << 14; // 16384 > 128
operation_t op = make_operation("op", get_reduce_op(get_type_info<size_t>().type));
const std::size_t num_items = 1 << 14; // 16384 > 128

operation_t op = make_operation("op", get_reduce_op(get_type_info<size_t>().type));
iterator_t<char, constant_iterator_state_t<char>> input_it = make_iterator<char, constant_iterator_state_t<char>>(
"struct constant_iterator_state_t { char value; };\n",
{"in_advance",
Expand All @@ -221,8 +222,8 @@ TEST_CASE("Reduce accumulator type is influenced by initial value", "[reduce]")

reduce(input_it, output_it, num_items, op, init);

const size_t output = output_it[0];
const int expected = init.value + num_items;
const size_t output = output_it[0];
const size_t expected = init.value + num_items;
REQUIRE(output == expected);
}

Expand Down
17 changes: 0 additions & 17 deletions c/test/CMakeLists.txt

This file was deleted.

15 changes: 15 additions & 0 deletions ci/build_cccl_c_parallel.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash

# Build the CCCL C Parallel library via the `cccl-c-parallel` CMake preset.

set -euo pipefail

source "$(dirname "$0")/build_common.sh"

print_environment_details

# The preset fully configures the build, so no extra CMake options are passed.
configure_and_build_preset "CCCL C Parallel Library" "cccl-c-parallel" ""

print_time_summary
9 changes: 5 additions & 4 deletions ci/inspect_changes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ subprojects=(
thrust
cudax
pycuda
c
cccl_c_parallel
)

# ...and their dependencies:
Expand All @@ -37,8 +37,8 @@ declare -A dependencies=(
[cub]="cccl libcudacxx thrust"
[thrust]="cccl libcudacxx cub"
[cudax]="cccl libcudacxx"
[pycuda]="cccl libcudacxx cub thrust c"
[c]="cccl libcudacxx cub"
[pycuda]="cccl libcudacxx cub thrust cccl_c_parallel"
[cccl_c_parallel]="cccl libcudacxx cub thrust"
)

declare -A project_names=(
Expand All @@ -48,14 +48,15 @@ declare -A project_names=(
[thrust]="Thrust"
[cudax]="CUDA Experimental"
[pycuda]="pycuda"
[c]="CUDA C Core Library "
[cccl_c_parallel]="CCCL C Parallel Library"
)

# By default, the project directory is assumed to be the same as the subproject name,
# but can be overridden here. The `cccl` project is special, and checks for files outside
# of any subproject directory.
declare -A project_dirs=(
[pycuda]="python/cuda_cooperative"
[cccl_c_parallel]="c/parallel"
)

# Usage checks:
Expand Down
7 changes: 5 additions & 2 deletions ci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ workflows:
- {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc12']}
- {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'max', cxx: ['clang14']}
- {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'max', cxx: ['clang18']}
# Python jobs:
- {jobs: ['test'], project: 'pycuda', ctk: ['12.5']}
# Python and c/parallel jobs:
- {jobs: ['test'], project: ['cccl_c_parallel', 'pycuda'], ctk: '12.5'}
# cccl-infra:
- {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc6', 'clang9']}
- {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14']}
Expand Down Expand Up @@ -233,6 +233,9 @@ projects:
pycuda:
name: "cuda (python)"
job_map: { build: [], test: ['test_nobuild'] }
cccl_c_parallel:
name: 'CCCL C Parallel'
stds: [20]

# testing -> Runner with GPU is in a nv-gh-runners testing pool
gpus:
Expand Down
13 changes: 13 additions & 0 deletions ci/test_cccl_c_parallel.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

# Build and run the CCCL C Parallel library tests via the `cccl-c-parallel`
# CMake preset.

# Fix: fail fast on errors, unset variables, and pipeline failures — the
# sibling build_cccl_c_parallel.sh already sets these; without them a failed
# build step would not stop the test run.
set -euo pipefail

source "$(dirname "$0")/build_common.sh"

print_environment_details

# Fix: invoke the sibling build script relative to this script's own
# directory, so the script works regardless of the caller's working directory
# (the previous `./build_cccl_c_parallel.sh` assumed CWD == ci/).
"$(dirname "$0")/build_cccl_c_parallel.sh" "$@"

PRESET="cccl-c-parallel"

test_preset "CCCL C Parallel Library" "${PRESET}"

print_time_summary
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def _get_bindings():
if _bindings is None:
include_path = importlib.resources.files(
'cuda.parallel.experimental').joinpath('cccl')
cccl_c_path = os.path.join(include_path, 'libcccl.c.so')
cccl_c_path = os.path.join(include_path, 'libcccl.c.parallel.so')
_bindings = ctypes.CDLL(cccl_c_path)
_bindings.cccl_device_reduce.restype = ctypes.c_int
_bindings.cccl_device_reduce.restype = ctypes.c_int
Expand Down
Loading

0 comments on commit e149e86

Please sign in to comment.