Skip to content

Commit

Permalink
Add Scan implementation for c.parallel (#3462)
Browse files Browse the repository at this point in the history
* Make thread_store.cuh NVRTC compilable

* Get TileState from KernelSource

* Use launcher_factory to get the SM occupancy, PTX version. Implement MaxgridDimX

* Put reduce stuff inside `reduce` namespace

* Handle PTX compilation in command list

* Missing noexcept

* Allow passing InitValueT without wrapping in InputValue

* Add scan c.parallel API

* Add tests for scan c.parallel API

* Use fewer items per thread and reinstate LDL/STL check

* Move load modifier check to policy

* Introduce detail functions to allocate/initialize tile state

* Update c.parallel scan_tile_state following c++ refactor

* Update cub/cub/thread/thread_store.cuh

Co-authored-by: Bernhard Manfred Gruber <[email protected]>

* No initialize-then-modify

* Use enum rather than bool

* Return a std::optional from find_size_t

* Annotate arguments with their positions

* Minor improvements to command_list

* Rename cubin->link_result

* Add a TODO for removing extra compile step

* Bad merge

* Pass thrust path to PTX compile step

* Fixes following merge from main

* Return error from AliasTemporaries

* Fix SFINAE

* Store description/payload bytes_per_tile directly in the build obj

---------

Co-authored-by: Ashwin Srinath <[email protected]>
Co-authored-by: Bernhard Manfred Gruber <[email protected]>
  • Loading branch information
3 people authored Feb 11, 2025
1 parent f745c97 commit bc57f2b
Show file tree
Hide file tree
Showing 17 changed files with 1,029 additions and 137 deletions.
2 changes: 1 addition & 1 deletion c/parallel/include/cccl/c/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,4 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce(
cccl_value_t init,
CUstream stream) noexcept;

extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr);
extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr) noexcept;
58 changes: 58 additions & 0 deletions c/parallel/include/cccl/c/scan.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA Core Compute Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#pragma once

#ifndef CCCL_C_EXPERIMENTAL
# error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice."
#endif // !CCCL_C_EXPERIMENTAL

#include <cuda.h>

#include <cccl/c/types.h>

// Build state for the device scan algorithm exposed through the C API.
// An instance is passed by pointer to cccl_device_scan_build, by value to
// cccl_device_scan, and by pointer to cccl_device_scan_cleanup (declared below).
// NOTE(review): the scan implementation is not part of this header, so the
// per-field notes are inferred from names and types — confirm against scan.cu.
struct cccl_device_scan_build_result_t
{
// Presumably the compute capability the kernels were built for — TODO confirm encoding.
int cc;
// Presumably the linked device image and its size in bytes; ownership and
// release are presumably handled by cccl_device_scan_cleanup — verify.
void* cubin;
size_t cubin_size;
// CUDA driver library handle; presumably the library the kernels below were obtained from.
CUlibrary library;
// Presumably the type used to accumulate partial scan results — TODO confirm.
cccl_type_info accumulator_type;
// Kernel handles; by name, one kernel for initialization and one performing the scan.
CUkernel init_kernel;
CUkernel scan_kernel;
// Presumably per-tile byte counts for the tile-state description and payload
// storage, recorded at build time — TODO confirm how they are used for temp storage.
size_t description_bytes_per_tile;
size_t payload_bytes_per_tile;
};

// Build step of the device scan C API. Exported with C linkage, declared
// noexcept, and reports its status as a CUresult.
//
// build            - pointer to the build-result object; presumably populated on success.
// d_in, d_out      - input and output iterator descriptions.
// op               - operator description; presumably the binary scan operator.
// init             - value description; presumably the initial value of the scan.
// cc_major/minor   - presumably the major/minor compute capability to build for.
// cub_path, thrust_path, libcudacxx_path, ctk_path
//                  - presumably locations of CUB, Thrust, libcudacxx and the CUDA
//                    toolkit used during runtime compilation — TODO confirm whether
//                    these are bare directories or full include flags.
extern "C" CCCL_C_API CUresult cccl_device_scan_build(
cccl_device_scan_build_result_t* build,
cccl_iterator_t d_in,
cccl_iterator_t d_out,
cccl_op_t op,
cccl_value_t init,
int cc_major,
int cc_minor,
const char* cub_path,
const char* thrust_path,
const char* libcudacxx_path,
const char* ctk_path) noexcept;

// Execution step of the device scan C API. Exported with C linkage, declared
// noexcept, and reports its status as a CUresult.
//
// build              - build-result object, taken by value.
// d_temp_storage     - temporary storage pointer.
// temp_storage_bytes - pointer to the temporary storage size.
//                      NOTE(review): presumably follows the CUB convention where a
//                      null d_temp_storage only writes the required size here and
//                      does no work — confirm against the implementation.
// d_in, d_out        - input and output iterator descriptions.
// num_items          - presumably the number of elements to scan.
// op                 - operator description; presumably the binary scan operator.
// init               - value description; presumably the initial value of the scan.
// stream             - CUDA stream handle; presumably the stream work is launched on.
extern "C" CCCL_C_API CUresult cccl_device_scan(
cccl_device_scan_build_result_t build,
void* d_temp_storage,
size_t* temp_storage_bytes,
cccl_iterator_t d_in,
cccl_iterator_t d_out,
unsigned long long num_items,
cccl_op_t op,
cccl_value_t init,
CUstream stream) noexcept;

// Cleanup step of the device scan C API. Takes the build-result object by
// pointer, is declared noexcept, and reports its status as a CUresult.
// NOTE(review): presumably releases the resources held by *bld_ptr (device image,
// library handle) — confirm, including behavior for a null pointer.
extern "C" CCCL_C_API CUresult cccl_device_scan_cleanup(cccl_device_scan_build_result_t* bld_ptr) noexcept;
6 changes: 3 additions & 3 deletions c/parallel/src/for.cu
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ extern "C" CCCL_C_API CUresult cccl_device_for_build(
.cleanup_program()
.add_link({op.ltoir, op.ltoir_size});

nvrtc_cubin result{};
nvrtc_link_result result{};

if (cccl_iterator_kind_t::iterator == d_data.type)
{
Expand All @@ -124,11 +124,11 @@ extern "C" CCCL_C_API CUresult cccl_device_for_build(
result = cl.finalize_program(num_lto_args, lopts);
}

cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
check(cuLibraryGetKernel(&build->static_kernel, build->library, lowered_name.c_str()));

build->cc = cc;
build->cubin = (void*) result.cubin.release();
build->cubin = (void*) result.data.release();
build->cubin_size = result.size;
}
catch (...)
Expand Down
2 changes: 1 addition & 1 deletion c/parallel/src/kernels/operators.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ make_kernel_binary_operator_full_source(std::string_view input_t, cccl_op_t oper
: std::format(stateful_binary_op_template, return_type));
}

std::string make_kernel_user_arithmetic_operator(std::string_view input_t, cccl_op_t operation)
std::string make_kernel_user_binary_operator(std::string_view input_t, cccl_op_t operation)
{
return make_kernel_binary_operator_full_source(input_t, operation, "VALUE_T");
}
Expand Down
2 changes: 1 addition & 1 deletion c/parallel/src/kernels/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@

#include <cccl/c/types.h>

std::string make_kernel_user_arithmetic_operator(std::string_view input_value_t, cccl_op_t operation);
std::string make_kernel_user_binary_operator(std::string_view input_value_t, cccl_op_t operation);

std::string make_kernel_user_comparison_operator(std::string_view input_value_t, cccl_op_t operation);
6 changes: 3 additions & 3 deletions c/parallel/src/merge_sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ extern "C" CCCL_C_API CUresult cccl_device_merge_sort_build(
ltoir_list_append({output_items_it.dereference.ltoir, output_items_it.dereference.ltoir_size});
}

nvrtc_cubin result =
nvrtc_link_result result =
make_nvrtc_command_list()
.add_program(nvrtc_translation_unit{src.c_str(), name})
.add_expression({block_sort_kernel_name})
Expand All @@ -424,13 +424,13 @@ extern "C" CCCL_C_API CUresult cccl_device_merge_sort_build(
.add_link_list(ltoir_list)
.finalize_program(num_lto_args, lopts);

cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
check(cuLibraryGetKernel(&build->block_sort_kernel, build->library, block_sort_kernel_lowered_name.c_str()));
check(cuLibraryGetKernel(&build->partition_kernel, build->library, partition_kernel_lowered_name.c_str()));
check(cuLibraryGetKernel(&build->merge_kernel, build->library, merge_kernel_lowered_name.c_str()));

build->cc = cc;
build->cubin = (void*) result.cubin.release();
build->cubin = (void*) result.data.release();
build->cubin_size = result.size;
}
catch (const std::exception& exc)
Expand Down
34 changes: 24 additions & 10 deletions c/parallel/src/nvrtc/command_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@
#include <nvrtc.h>
#include <util/errors.h>

struct nvrtc_cubin
struct nvrtc_link_result
{
std::unique_ptr<char[]> cubin{};
std::unique_ptr<char[]> data{};
size_t size;
};

Expand Down Expand Up @@ -57,7 +57,7 @@ struct nvrtc_ltoir
using nvrtc_ltoir_list = std::vector<nvrtc_ltoir>;
struct nvrtc_jitlink_cleanup
{
nvrtc_cubin& cubin_ref;
nvrtc_link_result& link_result_ref;
};

struct nvrtc_jitlink
Expand Down Expand Up @@ -156,9 +156,23 @@ struct nvrtc_command_list_visitor

check(jitlink_error);

check(nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.cubin_ref.size));
cleanup.cubin_ref.cubin = std::unique_ptr<char[]>(new char[cleanup.cubin_ref.size]);
check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.cubin_ref.cubin.get()));
bool output_ptx = false;
auto result = nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.link_result_ref.size);
if (result != NVJITLINK_SUCCESS)
{
output_ptx = true;
check(nvJitLinkGetLinkedPtxSize(jitlink.handle, &cleanup.link_result_ref.size));
}
cleanup.link_result_ref.data = std::unique_ptr<char[]>(new char[cleanup.link_result_ref.size]);

if (output_ptx)
{
check(nvJitLinkGetLinkedPtx(jitlink.handle, cleanup.link_result_ref.data.get()));
}
else
{
check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.link_result_ref.data.get()));
}
}
};

Expand Down Expand Up @@ -231,13 +245,13 @@ struct nvrtc_sm_top_level
}

// Execute steps and link unit
nvrtc_cubin finalize_program(uint32_t numLtoOpts, const char** ltoOpts)
nvrtc_link_result finalize_program(uint32_t numLtoOpts, const char** ltoOpts)
{
nvrtc_cubin cubin{};
nvrtc_jitlink_cleanup cleanup{cubin};
nvrtc_link_result link_result{};
nvrtc_jitlink_cleanup cleanup{link_result};
nvrtc_jitlink jl(numLtoOpts, ltoOpts);
std::apply(nvrtc_command_list_visitor{jl}, nvrtc_command_list_append(std::move(cl), std::move(cleanup)));
return cubin;
return link_result;
}
};

Expand Down
Loading

0 comments on commit bc57f2b

Please sign in to comment.