Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Scan implementation for c.parallel #3462

Merged
merged 27 commits into from
Feb 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
502daa6
Make thread_store.cuh NVRTC compilable
shwina Jan 27, 2025
3022bd9
Get TileState from KernelSource
shwina Jan 27, 2025
418a763
Use launcher_factory to get the SM occupancy, PTX version. Implement …
shwina Jan 27, 2025
6b38605
Put reduce stuff inside `reduce` namespace
shwina Jan 27, 2025
6365d5c
Handle PTX compilation in command list
shwina Jan 27, 2025
f3454c5
Missing noexcept
shwina Jan 27, 2025
3fe0ded
Allow passing InitValueT without wrapping in InputValue
shwina Jan 27, 2025
e4a1aea
Add scan c.parallel API
shwina Jan 27, 2025
c953d6e
Add tests for scan c.parallel API
shwina Jan 27, 2025
59a125a
Use fewer items per thread and reinstate LDL/STL check
shwina Jan 30, 2025
9925e11
Move load modifier check to policy
shwina Jan 30, 2025
c71bebe
Introduce detail functions to allocate/initialize tile state
shwina Jan 31, 2025
08d443e
Update c.parallel scan_tile_state following c++ refactor
shwina Feb 1, 2025
4060f59
Update cub/cub/thread/thread_store.cuh
shwina Feb 3, 2025
0a9b9b5
No initialize-then-modify
shwina Feb 3, 2025
39d6a16
Use enum rather than bool
shwina Feb 4, 2025
02f6512
Return a std::optional from find_size_t
shwina Feb 4, 2025
2f2cea1
Annotate arguments with their positions
shwina Feb 4, 2025
6ab3628
Minor improvements to command_list
shwina Feb 4, 2025
ff41638
Rename cubin->link_result
shwina Feb 4, 2025
c4102fd
Add a TODO for removing extra compile step
shwina Feb 4, 2025
e3a2e75
Bad merge
shwina Feb 7, 2025
fd63f70
Pass thrust path to PTX compile step
shwina Feb 7, 2025
9fe8dfe
Fixes following merge from main
shwina Feb 10, 2025
f0b1ed8
Return error from AliasTemporaries
shwina Feb 11, 2025
f20219b
Fix SFINAE
shwina Feb 11, 2025
7a83fbc
Store description/payload bytes_per_tile directly in the build obj
shwina Feb 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion c/parallel/include/cccl/c/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,4 @@ extern "C" CCCL_C_API CUresult cccl_device_reduce(
cccl_value_t init,
CUstream stream) noexcept;

extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr);
extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr) noexcept;
58 changes: 58 additions & 0 deletions c/parallel/include/cccl/c/scan.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA Core Compute Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#pragma once

#ifndef CCCL_C_EXPERIMENTAL
# error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice."
#endif // !CCCL_C_EXPERIMENTAL

#include <cuda.h>

#include <cccl/c/types.h>

// Build artifacts for the c.parallel device scan.
//
// A pointer to this struct is passed to cccl_device_scan_build() and
// cccl_device_scan_cleanup(); cccl_device_scan() takes the struct by value.
// NOTE(review): the per-field notes below are inferred from the field names and
// from the sibling `for` / `merge_sort` builders touched by the same change —
// confirm against the scan implementation before relying on them.
struct cccl_device_scan_build_result_t
{
// Presumably the compute capability the kernels were built for (the sibling
// builders store their `cc` value here) — TODO confirm the encoding.
int cc;
// Presumably the linked device image; the sibling builders store a pointer
// released from a std::unique_ptr<char[]> here, so cleanup would be expected
// to free it — TODO confirm ownership for scan.
void* cubin;
// Presumably the size in bytes of the image pointed to by `cubin`.
size_t cubin_size;
// CUDA driver library handle; presumably the library loaded from `cubin`.
CUlibrary library;
// Type information for the accumulator — presumably the type used to carry
// partial scan results; verify against the implementation.
cccl_type_info accumulator_type;
// Kernel handles; presumably looked up in `library`. The names suggest a
// tile-state initialization kernel and the main scan kernel.
CUkernel init_kernel;
CUkernel scan_kernel;
// Per-tile byte counts stored directly in the build object. Presumably used
// to size the tile-state portion of the temporary storage — TODO confirm.
size_t description_bytes_per_tile;
size_t payload_bytes_per_tile;
};

// Builds the device scan and fills in `*build`.
//
// Exposed with C linkage, declared noexcept, and reports its status as a
// CUresult.
//
// Parameters:
//   build            - build result object to populate.
//   d_in, d_out      - input and output iterator descriptions.
//   op               - the scan operator description.
//   init             - the initial value description.
//   cc_major,
//   cc_minor         - presumably the major/minor compute capability to target.
//   cub_path,
//   thrust_path,
//   libcudacxx_path,
//   ctk_path         - presumably include locations / options for CUB, Thrust,
//                      libcudacxx and the CUDA toolkit used during runtime
//                      compilation — TODO confirm the expected string format.
//
// NOTE(review): whether the iterators/op/init passed here must match those
// later passed to cccl_device_scan() is not visible in this header — confirm.
extern "C" CCCL_C_API CUresult cccl_device_scan_build(
cccl_device_scan_build_result_t* build,
cccl_iterator_t d_in,
cccl_iterator_t d_out,
cccl_op_t op,
cccl_value_t init,
int cc_major,
int cc_minor,
const char* cub_path,
const char* thrust_path,
const char* libcudacxx_path,
const char* ctk_path) noexcept;

// Runs the device scan using a previously populated build result.
//
// Exposed with C linkage, declared noexcept, and reports its status as a
// CUresult.
//
// Parameters:
//   build              - build result, passed by value.
//   d_temp_storage     - temporary storage buffer.
//   temp_storage_bytes - pointer to the temporary storage size.
//                        NOTE(review): presumably follows the CUB two-phase
//                        convention (null `d_temp_storage` => only the required
//                        size is written here) — confirm in the implementation.
//   d_in, d_out        - input and output iterator descriptions.
//   num_items          - number of items to scan.
//   op                 - the scan operator description.
//   init               - the initial value description.
//   stream             - CUDA stream; presumably the stream the work is
//                        launched on.
extern "C" CCCL_C_API CUresult cccl_device_scan(
cccl_device_scan_build_result_t build,
void* d_temp_storage,
size_t* temp_storage_bytes,
cccl_iterator_t d_in,
cccl_iterator_t d_out,
unsigned long long num_items,
cccl_op_t op,
cccl_value_t init,
CUstream stream) noexcept;

// Cleans up a scan build result. Exposed with C linkage, declared noexcept, and
// reports its status as a CUresult.
// NOTE(review): presumably releases the resources held by `*bld_ptr` (device
// image and loaded library); behavior for a null pointer is not visible here —
// confirm in the implementation.
extern "C" CCCL_C_API CUresult cccl_device_scan_cleanup(cccl_device_scan_build_result_t* bld_ptr) noexcept;
6 changes: 3 additions & 3 deletions c/parallel/src/for.cu
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ extern "C" CCCL_C_API CUresult cccl_device_for_build(
.cleanup_program()
.add_link({op.ltoir, op.ltoir_size});

nvrtc_cubin result{};
nvrtc_link_result result{};

if (cccl_iterator_kind_t::iterator == d_data.type)
{
Expand All @@ -124,11 +124,11 @@ extern "C" CCCL_C_API CUresult cccl_device_for_build(
result = cl.finalize_program(num_lto_args, lopts);
}

cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
check(cuLibraryGetKernel(&build->static_kernel, build->library, lowered_name.c_str()));

build->cc = cc;
build->cubin = (void*) result.cubin.release();
build->cubin = (void*) result.data.release();
build->cubin_size = result.size;
}
catch (...)
Expand Down
2 changes: 1 addition & 1 deletion c/parallel/src/kernels/operators.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ make_kernel_binary_operator_full_source(std::string_view input_t, cccl_op_t oper
: std::format(stateful_binary_op_template, return_type));
}

std::string make_kernel_user_arithmetic_operator(std::string_view input_t, cccl_op_t operation)
std::string make_kernel_user_binary_operator(std::string_view input_t, cccl_op_t operation)
{
return make_kernel_binary_operator_full_source(input_t, operation, "VALUE_T");
}
Expand Down
2 changes: 1 addition & 1 deletion c/parallel/src/kernels/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@

#include <cccl/c/types.h>

std::string make_kernel_user_arithmetic_operator(std::string_view input_value_t, cccl_op_t operation);
std::string make_kernel_user_binary_operator(std::string_view input_value_t, cccl_op_t operation);

std::string make_kernel_user_comparison_operator(std::string_view input_value_t, cccl_op_t operation);
6 changes: 3 additions & 3 deletions c/parallel/src/merge_sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ extern "C" CCCL_C_API CUresult cccl_device_merge_sort_build(
ltoir_list_append({output_items_it.dereference.ltoir, output_items_it.dereference.ltoir_size});
}

nvrtc_cubin result =
nvrtc_link_result result =
make_nvrtc_command_list()
.add_program(nvrtc_translation_unit{src.c_str(), name})
.add_expression({block_sort_kernel_name})
Expand All @@ -424,13 +424,13 @@ extern "C" CCCL_C_API CUresult cccl_device_merge_sort_build(
.add_link_list(ltoir_list)
.finalize_program(num_lto_args, lopts);

cuLibraryLoadData(&build->library, result.cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
cuLibraryLoadData(&build->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0);
check(cuLibraryGetKernel(&build->block_sort_kernel, build->library, block_sort_kernel_lowered_name.c_str()));
check(cuLibraryGetKernel(&build->partition_kernel, build->library, partition_kernel_lowered_name.c_str()));
check(cuLibraryGetKernel(&build->merge_kernel, build->library, merge_kernel_lowered_name.c_str()));

build->cc = cc;
build->cubin = (void*) result.cubin.release();
build->cubin = (void*) result.data.release();
build->cubin_size = result.size;
}
catch (const std::exception& exc)
Expand Down
34 changes: 24 additions & 10 deletions c/parallel/src/nvrtc/command_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@
#include <nvrtc.h>
#include <util/errors.h>

struct nvrtc_cubin
struct nvrtc_link_result
{
std::unique_ptr<char[]> cubin{};
std::unique_ptr<char[]> data{};
size_t size;
};

Expand Down Expand Up @@ -57,7 +57,7 @@ struct nvrtc_ltoir
using nvrtc_ltoir_list = std::vector<nvrtc_ltoir>;
struct nvrtc_jitlink_cleanup
{
nvrtc_cubin& cubin_ref;
nvrtc_link_result& link_result_ref;
};

struct nvrtc_jitlink
Expand Down Expand Up @@ -156,9 +156,23 @@ struct nvrtc_command_list_visitor

check(jitlink_error);

check(nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.cubin_ref.size));
cleanup.cubin_ref.cubin = std::unique_ptr<char[]>(new char[cleanup.cubin_ref.size]);
check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.cubin_ref.cubin.get()));
bool output_ptx = false;
auto result = nvJitLinkGetLinkedCubinSize(jitlink.handle, &cleanup.link_result_ref.size);
if (result != NVJITLINK_SUCCESS)
{
output_ptx = true;
check(nvJitLinkGetLinkedPtxSize(jitlink.handle, &cleanup.link_result_ref.size));
}
cleanup.link_result_ref.data = std::unique_ptr<char[]>(new char[cleanup.link_result_ref.size]);

if (output_ptx)
{
check(nvJitLinkGetLinkedPtx(jitlink.handle, cleanup.link_result_ref.data.get()));
}
else
{
check(nvJitLinkGetLinkedCubin(jitlink.handle, cleanup.link_result_ref.data.get()));
}
}
};

Expand Down Expand Up @@ -231,13 +245,13 @@ struct nvrtc_sm_top_level
}

// Execute steps and link unit
nvrtc_cubin finalize_program(uint32_t numLtoOpts, const char** ltoOpts)
nvrtc_link_result finalize_program(uint32_t numLtoOpts, const char** ltoOpts)
{
nvrtc_cubin cubin{};
nvrtc_jitlink_cleanup cleanup{cubin};
nvrtc_link_result link_result{};
nvrtc_jitlink_cleanup cleanup{link_result};
nvrtc_jitlink jl(numLtoOpts, ltoOpts);
std::apply(nvrtc_command_list_visitor{jl}, nvrtc_command_list_append(std::move(cl), std::move(cleanup)));
return cubin;
return link_result;
}
};

Expand Down
Loading
Loading