From e149e8623792a915d2b4fe63590c4794b0b1e271 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Wed, 9 Oct 2024 14:18:03 -0400 Subject: [PATCH] Integrate c/parallel with CCCL build system and CI. (#2514) Integrate c/parallel into CCCL, setup CI, etc. --- CMakePresets.json | 20 +++++++ c/CMakeLists.txt | 26 +-------- c/parallel/CMakeLists.txt | 53 +++++++++++++++++++ c/parallel/cmake/CParallelHeaderTesting.cmake | 11 ++++ c/{ => parallel}/include/cccl/c/for.h | 0 c/{ => parallel}/include/cccl/c/reduce.h | 0 c/{ => parallel}/include/cccl/c/types.h | 0 c/{ => parallel}/src/for.cu | 2 +- c/{ => parallel}/src/for/for_op_helper.cpp | 1 - c/{ => parallel}/src/for/for_op_helper.h | 0 c/{ => parallel}/src/reduce.cu | 6 +-- c/{ => parallel}/src/util/context.cpp | 0 c/{ => parallel}/src/util/context.h | 0 c/{ => parallel}/src/util/errors.cpp | 0 c/{ => parallel}/src/util/errors.h | 0 c/{ => parallel}/src/util/types.cpp | 0 c/{ => parallel}/src/util/types.h | 0 c/parallel/test/CMakeLists.txt | 40 ++++++++++++++ c/{ => parallel}/test/c2h.h | 0 c/{ => parallel}/test/test_for.cpp | 0 c/{ => parallel}/test/test_main.cpp | 0 c/{ => parallel}/test/test_reduce.cpp | 13 ++--- c/test/CMakeLists.txt | 17 ------ ci/build_cccl_c_parallel.sh | 15 ++++++ ci/inspect_changes.sh | 9 ++-- ci/matrix.yaml | 7 ++- ci/test_cccl_c_parallel.sh | 13 +++++ .../cuda/parallel/experimental/__init__.py | 2 +- python/cuda_parallel/setup.py | 6 +-- 29 files changed, 177 insertions(+), 64 deletions(-) create mode 100644 c/parallel/CMakeLists.txt create mode 100644 c/parallel/cmake/CParallelHeaderTesting.cmake rename c/{ => parallel}/include/cccl/c/for.h (100%) rename c/{ => parallel}/include/cccl/c/reduce.h (100%) rename c/{ => parallel}/include/cccl/c/types.h (100%) rename c/{ => parallel}/src/for.cu (99%) rename c/{ => parallel}/src/for/for_op_helper.cpp (98%) rename c/{ => parallel}/src/for/for_op_helper.h (100%) rename c/{ => parallel}/src/reduce.cu (98%) rename c/{ => parallel}/src/util/context.cpp (100%) rename c/{ => parallel}/src/util/context.h (100%) rename c/{ => parallel}/src/util/errors.cpp (100%) rename c/{ => parallel}/src/util/errors.h (100%) rename c/{ => parallel}/src/util/types.cpp (100%) rename c/{ => parallel}/src/util/types.h (100%) create mode 100644 c/parallel/test/CMakeLists.txt rename c/{ => parallel}/test/c2h.h (100%) rename c/{ => parallel}/test/test_for.cpp (100%) rename c/{ => parallel}/test/test_main.cpp (100%) rename c/{ => parallel}/test/test_reduce.cpp (96%) delete mode 100644 c/test/CMakeLists.txt create mode 100755 ci/build_cccl_c_parallel.sh create mode 100755 ci/test_cccl_c_parallel.sh diff --git a/CMakePresets.json b/CMakePresets.json index 9c28e374ce2..7d611714064 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -21,6 +21,7 @@ "CCCL_ENABLE_CUDAX": false, "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, + "CCCL_ENABLE_C": false, "libcudacxx_ENABLE_INSTALL_RULES": true, "CUB_ENABLE_INSTALL_RULES": true, "Thrust_ENABLE_INSTALL_RULES": true, @@ -314,6 +315,16 @@ "cudax_ENABLE_DIALECT_CPP20": true } }, + { + "name": "cccl-c-parallel", + "displayName" : "CCCL C Parallel Library", + "inherits": "base", + "cacheVariables": { + "CCCL_ENABLE_C": true, + "CCCL_C_Parallel_ENABLE_TESTING": true, + "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true + } + }, { "name": "cccl-infra", "displayName": "CCCL Infrastructure", @@ -443,6 +454,10 @@ "name": "cudax-cpp20", "configurePreset": "cudax-cpp20" }, + { + "name": "cccl-c-parallel", + "configurePreset": "cccl-c-parallel" + }, { "name": "cccl-infra", "configurePreset": "cccl-infra" @@ -808,6 +823,11 @@ "configurePreset": "cudax-cpp20", "inherits": "cudax-base" }, + { + "name": "cccl-c-parallel", + "configurePreset": "cccl-c-parallel", + "inherits": "base" + }, { "name": "cccl-infra", "configurePreset": "cccl-infra", diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index e9761c33f2d..7f1dbf4507b 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -1,25 +1 @@ -cmake_minimum_required(VERSION 3.30) - -project(cccl.c LANGUAGES CUDA CXX) - -add_library(cccl.c SHARED - src/reduce.cu src/for.cu - src/for/for_op_helper.cpp - src/util/errors.cpp src/util/types.cpp src/util/context.cpp) - -set_property(TARGET cccl.c PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET cccl.c PROPERTY CXX_STANDARD 20) -set_property(TARGET cccl.c PROPERTY CUDA_STANDARD 20) - -find_package(CUDAToolkit REQUIRED) - -# TODO Use static versions of cudart, nvrtc, and nvJitLink -target_link_libraries(cccl.c PRIVATE CUDA::cudart - CUDA::nvrtc - CUDA::nvJitLink - CUDA::cuda_driver) -target_compile_definitions(cccl.c PRIVATE NVRTC_GET_TYPE_NAME=1 CCCL_C_EXPERIMENTAL=1) -target_include_directories(cccl.c PUBLIC "include") -target_include_directories(cccl.c PRIVATE "src") - -add_subdirectory(test) +add_subdirectory(parallel) diff --git a/c/parallel/CMakeLists.txt b/c/parallel/CMakeLists.txt new file mode 100644 index 00000000000..0115d9d64c6 --- /dev/null +++ b/c/parallel/CMakeLists.txt @@ -0,0 +1,53 @@ +cmake_minimum_required(VERSION 3.21) + +project(CCCL_C_Parallel LANGUAGES CUDA CXX) + +option(CCCL_C_Parallel_ENABLE_TESTING "Build CUDA Experimental's tests." OFF) +option(CCCL_C_Parallel_ENABLE_HEADER_TESTING "Build CUDA Experimental's standalone headers." OFF) + +# FIXME Ideally this would be handled by presets and install rules, but for now +# consumers may override this to control the target location of cccl.c.parallel. +set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY "" CACHE PATH "Override output directory for the cccl.c.parallel library") +mark_as_advanced(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY) + +file(GLOB_RECURSE srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + "src/*.cu" "src/*.cpp" +) + +add_library(cccl.c.parallel SHARED ${srcs}) +set_property(TARGET cccl.c.parallel PROPERTY POSITION_INDEPENDENT_CODE ON) +cccl_configure_target(cccl.c.parallel DIALECT 20) + +# Override the properties set by cccl_configure_target: +if (CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY) + set_target_properties(cccl.c.parallel PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY}" + ARCHIVE_OUTPUT_DIRECTORY "${CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY}" + ) +endif() + +find_package(CUDAToolkit REQUIRED) + +# TODO Use static versions of cudart, nvrtc, and nvJitLink +target_link_libraries(cccl.c.parallel PRIVATE + CUDA::cudart + CUDA::nvrtc + CUDA::nvJitLink + CUDA::cuda_driver + cccl.compiler_interface_cpp20 +) +target_compile_definitions(cccl.c.parallel PUBLIC CCCL_C_EXPERIMENTAL=1) +target_compile_definitions(cccl.c.parallel PRIVATE NVRTC_GET_TYPE_NAME=1) + +target_include_directories(cccl.c.parallel PUBLIC "include") +target_include_directories(cccl.c.parallel PRIVATE "src") + +if (CCCL_C_Parallel_ENABLE_TESTING) + add_subdirectory(test) +endif() + +if (CCCL_C_Parallel_ENABLE_HEADER_TESTING) + include(cmake/CParallelHeaderTesting.cmake) +endif() diff --git a/c/parallel/cmake/CParallelHeaderTesting.cmake b/c/parallel/cmake/CParallelHeaderTesting.cmake new file mode 100644 index 00000000000..bbb938ab815 --- /dev/null +++ b/c/parallel/cmake/CParallelHeaderTesting.cmake @@ -0,0 +1,11 @@ +# For every public header, build a translation unit containing `#include
` +# to let the compiler try to figure out warnings in that header if it is not otherwise +# included in tests, and also to verify if the headers are modular enough. +# .inl files are not globbed for, because they are not supposed to be used as public +# entrypoints. + +cccl_generate_header_tests(cccl.c.parallel.headers c/parallel/include + DIALECT 20 + GLOBS "cccl/c/*.h" +) +target_link_libraries(cccl.c.parallel.headers PUBLIC cccl.c.parallel) diff --git a/c/include/cccl/c/for.h b/c/parallel/include/cccl/c/for.h similarity index 100% rename from c/include/cccl/c/for.h rename to c/parallel/include/cccl/c/for.h diff --git a/c/include/cccl/c/reduce.h b/c/parallel/include/cccl/c/reduce.h similarity index 100% rename from c/include/cccl/c/reduce.h rename to c/parallel/include/cccl/c/reduce.h diff --git a/c/include/cccl/c/types.h b/c/parallel/include/cccl/c/types.h similarity index 100% rename from c/include/cccl/c/types.h rename to c/parallel/include/cccl/c/types.h diff --git a/c/src/for.cu b/c/parallel/src/for.cu similarity index 99% rename from c/src/for.cu rename to c/parallel/src/for.cu index 2f46c3843d4..4fa32a3e32b 100644 --- a/c/src/for.cu +++ b/c/parallel/src/for.cu @@ -32,7 +32,7 @@ using OffsetT = unsigned long long; static_assert(std::is_same_v, OffsetT>, "OffsetT must be size_t"); static cudaError_t -Invoke(cccl_iterator_t d_in, size_t num_items, cccl_op_t op, int cc, CUfunction static_kernel, CUstream stream) +Invoke(cccl_iterator_t d_in, size_t num_items, cccl_op_t op, int /*cc*/, CUfunction static_kernel, CUstream stream) { cudaError error = cudaSuccess; diff --git a/c/src/for/for_op_helper.cpp b/c/parallel/src/for/for_op_helper.cpp similarity index 98% rename from c/src/for/for_op_helper.cpp rename to c/parallel/src/for/for_op_helper.cpp index b7f4b1e8ae5..247bfbff2b0 100644 --- a/c/src/for/for_op_helper.cpp +++ b/c/parallel/src/for/for_op_helper.cpp @@ -201,7 +201,6 @@ for_each_kernel_state make_for_kernel_state(cccl_op_t op, cccl_iterator_t iterat { // Iterator is either a pointer or a stateful object, allocate space according to its size or alignment size_t iter_size = (cccl_iterator_kind_t::iterator == iterator.type) ? iterator.size : sizeof(void*); - size_t iter_align = (cccl_iterator_kind_t::iterator == iterator.type) ? iterator.alignment : alignof(void*); void* iterator_state = (cccl_iterator_kind_t::iterator == iterator.type) ? iterator.state : &iterator.state; // Do we need to valid user input? Alignments larger than the provided size? diff --git a/c/src/for/for_op_helper.h b/c/parallel/src/for/for_op_helper.h similarity index 100% rename from c/src/for/for_op_helper.h rename to c/parallel/src/for/for_op_helper.h diff --git a/c/src/reduce.cu b/c/parallel/src/reduce.cu similarity index 98% rename from c/src/reduce.cu rename to c/parallel/src/reduce.cu index 97f8793bb78..a8d111d65af 100644 --- a/c/src/reduce.cu +++ b/c/parallel/src/reduce.cu @@ -64,7 +64,7 @@ static reduce_tuning_t find_tuning(int cc, const reduce_tuning_t (&tunings)[N]) return tunings[N - 1]; } -static runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type, cccl_type_info input_type) +static runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type, cccl_type_info /*input_type*/) { reduce_tuning_t chain[] = {{60, 256, 16, 4}, {35, 256, 20, 4}}; @@ -77,7 +77,7 @@ static runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type, return {block_size, items_per_thread, vector_load_length}; } -static cccl_type_info get_accumulator_type(cccl_op_t op, cccl_iterator_t input_it, cccl_value_t init) +static cccl_type_info get_accumulator_type(cccl_op_t /*op*/, cccl_iterator_t /*input_it*/, cccl_value_t init) { // TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet // so switching back to the old accumulator type logic for now @@ -254,7 +254,7 @@ static cudaError_t Invoke( runtime_tuning_policy policy = get_policy(cc, accum_t, d_in.value_type); // Force kernel code-generation in all compiler passes - if (num_items <= (policy.block_size * policy.items_per_thread)) + if (num_items <= static_cast(policy.block_size * policy.items_per_thread)) { // Small, single tile size return InvokeSingleTile( diff --git a/c/src/util/context.cpp b/c/parallel/src/util/context.cpp similarity index 100% rename from c/src/util/context.cpp rename to c/parallel/src/util/context.cpp diff --git a/c/src/util/context.h b/c/parallel/src/util/context.h similarity index 100% rename from c/src/util/context.h rename to c/parallel/src/util/context.h diff --git a/c/src/util/errors.cpp b/c/parallel/src/util/errors.cpp similarity index 100% rename from c/src/util/errors.cpp rename to c/parallel/src/util/errors.cpp diff --git a/c/src/util/errors.h b/c/parallel/src/util/errors.h similarity index 100% rename from c/src/util/errors.h rename to c/parallel/src/util/errors.h diff --git a/c/src/util/types.cpp b/c/parallel/src/util/types.cpp similarity index 100% rename from c/src/util/types.cpp rename to c/parallel/src/util/types.cpp diff --git a/c/src/util/types.h b/c/parallel/src/util/types.h similarity index 100% rename from c/src/util/types.h rename to c/parallel/src/util/types.h diff --git a/c/parallel/test/CMakeLists.txt b/c/parallel/test/CMakeLists.txt new file mode 100644 index 00000000000..fae1160dec0 --- /dev/null +++ b/c/parallel/test/CMakeLists.txt @@ -0,0 +1,40 @@ +cccl_get_catch2() + +function(cccl_c_parallel_add_test target_name_var source) + string(REGEX REPLACE "test_([^.]*)" "cccl.c.parallel.test.\\1" target_name "${source}") + set(target_name_var ${target_name} PARENT_SCOPE) + + add_executable(${target_name} + "${source}" + test_main.cpp + ) + cccl_configure_target(${target_name} DIALECT 20) + + target_link_libraries(${target_name} PRIVATE + cccl.c.parallel + CUDA::cudart + CUDA::nvrtc + Catch2::Catch2 + cccl.compiler_interface_cpp20 + ) + + target_compile_definitions(${target_name} PRIVATE + TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub" + TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/cub" + TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include" + TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}" + ) + + add_test(NAME ${target_name} COMMAND ${target_name}) +endfunction() + +file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + *.cu *.cpp +) +list(REMOVE_ITEM test_srcs test_main.cpp) + +foreach(test_src IN LISTS test_srcs) + cccl_c_parallel_add_test(test_target "${test_src}") +endforeach() diff --git a/c/test/c2h.h b/c/parallel/test/c2h.h similarity index 100% rename from c/test/c2h.h rename to c/parallel/test/c2h.h diff --git a/c/test/test_for.cpp b/c/parallel/test/test_for.cpp similarity index 100% rename from c/test/test_for.cpp rename to c/parallel/test/test_for.cpp diff --git a/c/test/test_main.cpp b/c/parallel/test/test_main.cpp similarity index 100% rename from c/test/test_main.cpp rename to c/parallel/test/test_main.cpp diff --git a/c/test/test_reduce.cpp b/c/parallel/test/test_reduce.cpp similarity index 96% rename from c/test/test_reduce.cpp rename to c/parallel/test/test_reduce.cpp index c98f350390a..74f00c09507 100644 --- a/c/test/test_reduce.cpp +++ b/c/parallel/test/test_reduce.cpp @@ -48,7 +48,7 @@ void reduce(cccl_iterator_t input, cccl_iterator_t output, unsigned long long nu using integral_types = std::tuple; TEMPLATE_LIST_TEST_CASE("Reduce works with integral types", "[reduce]", integral_types) { - const int num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24))); + const std::size_t num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24))); operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); const std::vector input = generate(num_items); pointer_t input_ptr(input); @@ -70,7 +70,7 @@ struct pair TEST_CASE("Reduce works with custom types", "[reduce]") { - const int num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24))); + const std::size_t num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24))); operation_t op = make_operation( "op", @@ -204,8 +204,9 @@ TEST_CASE("Reduce works with input and output iterators", "[reduce]") TEST_CASE("Reduce accumulator type is influenced by initial value", "[reduce]") { - const int num_items = 1 << 14; // 16384 > 128 - operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); + const std::size_t num_items = 1 << 14; // 16384 > 128 + + operation_t op = make_operation("op", get_reduce_op(get_type_info().type)); iterator_t> input_it = make_iterator>( "struct constant_iterator_state_t { char value; };\n", {"in_advance", @@ -221,8 +222,8 @@ TEST_CASE("Reduce accumulator type is influenced by initial value", "[reduce]") reduce(input_it, output_it, num_items, op, init); - const size_t output = output_it[0]; - const int expected = init.value + num_items; + const size_t output = output_it[0]; + const size_t expected = init.value + num_items; REQUIRE(output == expected); } diff --git a/c/test/CMakeLists.txt b/c/test/CMakeLists.txt deleted file mode 100644 index a9223faa4dc..00000000000 --- a/c/test/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -add_executable(cccl.c.test.reduce test_reduce.cpp test_main.cpp) -add_executable(cccl.c.test.for test_for.cpp test_main.cpp) - -target_link_libraries(cccl.c.test.reduce PRIVATE cccl.c CUDA::cudart CUDA::nvrtc Catch2::Catch2) -target_link_libraries(cccl.c.test.for PRIVATE cccl.c CUDA::cudart CUDA::nvrtc Catch2::Catch2) - -target_compile_definitions(cccl.c.test.reduce PRIVATE CCCL_C_EXPERIMENTAL - TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub" - TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/cub" - TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include" - TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}") - -target_compile_definitions(cccl.c.test.for PRIVATE CCCL_C_EXPERIMENTAL - TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub" - TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/cub" - TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include" - TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}") diff --git a/ci/build_cccl_c_parallel.sh b/ci/build_cccl_c_parallel.sh new file mode 100755 index 00000000000..5c59815fe14 --- /dev/null +++ b/ci/build_cccl_c_parallel.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -euo pipefail + +source "$(dirname "$0")/build_common.sh" + +print_environment_details + +PRESET="cccl-c-parallel" + +CMAKE_OPTIONS="" + +configure_and_build_preset "CCCL C Parallel Library" "$PRESET" "$CMAKE_OPTIONS" + +print_time_summary diff --git a/ci/inspect_changes.sh b/ci/inspect_changes.sh index 72c37ba9c57..ddf25e6260e 100755 --- a/ci/inspect_changes.sh +++ b/ci/inspect_changes.sh @@ -27,7 +27,7 @@ subprojects=( thrust cudax pycuda - c + cccl_c_parallel ) # ...and their dependencies: @@ -37,8 +37,8 @@ declare -A dependencies=( [cub]="cccl libcudacxx thrust" [thrust]="cccl libcudacxx cub" [cudax]="cccl libcudacxx" - [pycuda]="cccl libcudacxx cub thrust c" - [c]="cccl libcudacxx cub" + [pycuda]="cccl libcudacxx cub thrust cccl_c_parallel" + [cccl_c_parallel]="cccl libcudacxx cub thrust" ) declare -A project_names=( @@ -48,7 +48,7 @@ declare -A project_names=( [thrust]="Thrust" [cudax]="CUDA Experimental" [pycuda]="pycuda" - [c]="CUDA C Core Library " + [cccl_c_parallel]="CCCL C Parallel Library" ) # By default, the project directory is assumed to be the same as the subproject name, @@ -56,6 +56,7 @@ declare -A project_names=( # of any subproject directory. declare -A project_dirs=( [pycuda]="python/cuda_cooperative" + [cccl_c_parallel]="c/parallel" ) # Usage checks: diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 174c912d0b2..ae1ab5671d1 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -46,8 +46,8 @@ workflows: - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc12']} - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'max', cxx: ['clang14']} - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'max', cxx: ['clang18']} - # Python jobs: - - {jobs: ['test'], project: 'pycuda', ctk: ['12.5']} + # Python and c/parallel jobs: + - {jobs: ['test'], project: ['cccl_c_parallel', 'pycuda'], ctk: '12.5'} # cccl-infra: - {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc6', 'clang9']} - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14']} @@ -233,6 +233,9 @@ projects: pycuda: name: "cuda (python)" job_map: { build: [], test: ['test_nobuild'] } + cccl_c_parallel: + name: 'CCCL C Parallel' + stds: [20] # testing -> Runner with GPU is in a nv-gh-runners testing pool gpus: diff --git a/ci/test_cccl_c_parallel.sh b/ci/test_cccl_c_parallel.sh new file mode 100755 index 00000000000..852869cc1af --- /dev/null +++ b/ci/test_cccl_c_parallel.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +source "$(dirname "$0")/build_common.sh" + +print_environment_details + +./build_cccl_c_parallel.sh "$@" + +PRESET="cccl-c-parallel" + +test_preset "CCCL C Parallel Library" ${PRESET} + +print_time_summary diff --git a/python/cuda_parallel/cuda/parallel/experimental/__init__.py b/python/cuda_parallel/cuda/parallel/experimental/__init__.py index 0fa2d09df11..36813a737a9 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/__init__.py +++ b/python/cuda_parallel/cuda/parallel/experimental/__init__.py @@ -146,7 +146,7 @@ def _get_bindings(): if _bindings is None: include_path = importlib.resources.files( 'cuda.parallel.experimental').joinpath('cccl') - cccl_c_path = os.path.join(include_path, 'libcccl.c.so') + cccl_c_path = os.path.join(include_path, 'libcccl.c.parallel.so') _bindings = ctypes.CDLL(cccl_c_path) _bindings.cccl_device_reduce.restype = ctypes.c_int _bindings.cccl_device_reduce.restype = ctypes.c_int diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py index c29a5237fc0..3a25f7d89d1 100644 --- a/python/cuda_parallel/setup.py +++ b/python/cuda_parallel/setup.py @@ -77,10 +77,8 @@ def build_extension(self, ext): extdir = os.path.abspath(os.path.dirname( self.get_ext_fullpath(ext.name))) cmake_args = [ - '-DCCCL_ENABLE_CUB=YES', - '-DCCCL_ENABLE_THRUST=YES', '-DCCCL_ENABLE_C=YES', - '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, + '-DCCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY=' + extdir, '-DCMAKE_BUILD_TYPE=Release', ] @@ -90,7 +88,7 @@ def build_extension(self, ext): subprocess.check_call(['cmake', cccl_path] + cmake_args, cwd=self.build_temp) subprocess.check_call( - ['cmake', '--build', '.', '--target', 'cccl.c'], cwd=self.build_temp) + ['cmake', '--build', '.', '--target', 'cccl.c.parallel'], cwd=self.build_temp) setup(