From 87ae049378eed3120c19924bcc3f2cec2fc1d90f Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Fri, 31 Jul 2020 17:22:32 -0400 Subject: [PATCH] WIP Adding async scan algorithms. --- dependencies/cub | 2 +- testing/CMakeLists.txt | 2 + testing/async_exclusive_scan.cu | 424 ++++++++++++++++++ testing/async_inclusive_scan.cu | 108 +++++ testing/unittest/util_async.h | 257 ++++++++++- thrust/async/scan.h | 355 +++++++++++++++ .../system/cuda/detail/async/exclusive_scan.h | 180 ++++++++ .../system/cuda/detail/async/inclusive_scan.h | 173 +++++++ thrust/system/cuda/detail/async/scan.h | 33 ++ thrust/system/detail/adl/async/scan.h | 34 ++ thrust/system/detail/generic/scan.inl | 4 +- 11 files changed, 1545 insertions(+), 27 deletions(-) create mode 100644 testing/async_exclusive_scan.cu create mode 100644 testing/async_inclusive_scan.cu create mode 100644 thrust/async/scan.h create mode 100644 thrust/system/cuda/detail/async/exclusive_scan.h create mode 100644 thrust/system/cuda/detail/async/inclusive_scan.h create mode 100644 thrust/system/cuda/detail/async/scan.h create mode 100644 thrust/system/detail/adl/async/scan.h diff --git a/dependencies/cub b/dependencies/cub index 2749cb0c7b..99ac8fe71e 160000 --- a/dependencies/cub +++ b/dependencies/cub @@ -1 +1 @@ -Subproject commit 2749cb0c7bc5a72c806d7ca0b8e4d702dbe017e5 +Subproject commit 99ac8fe71e937d57c594c38fe2d3ee9fd99e1a1b diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index fdfc04e97b..ca6ed63247 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -20,9 +20,11 @@ add_subdirectory(unittest) # List of tests that aren't implemented for all backends, but are implemented for CUDA. set(partially_implemented_CUDA async_copy + async_exclusive_scan async_for_each async_reduce async_reduce_into + async_inclusive_scan async_sort async_transform event diff --git a/testing/async_exclusive_scan.cu b/testing/async_exclusive_scan.cu new file mode 100644 index 0000000000..47b1d6d168 --- /dev/null +++ b/testing/async_exclusive_scan.cu @@ -0,0 +1,424 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include + +#include + +#include +#include + +#include +#include + +// TODO Finish implementing tests. Draw from other async algos, as well as +// the older scan tests. + +// X Add a description string method to AlgoDef and add it to any thrown +// exceptions +// - Change the num_values NTTP to a runtime variable for better integration +// with the existing framework. +// - For a large number of values, the test will fail for floating point numbers +// because we're comparing the cpp and cuda/async backends and fp adds are +// non-associative. +// - CUB's scans guarantee deterministic results, but only when on the exact +// same device. +// - Don't want to test against synch cuda backend, that'll just compare +// cub::DeviceScan to cub::DeviceScan. +// - Quick test using fuzzy compare for FP types had issues once +// custom_numeric was shown to an ASSERT_ALMOST_EQUAL. +// - Should probably fix ASSERT_ALMOST_EQUAL to work with custom_numeric. + +// - Build def using mixins for e.g. counting_iterator input, discard outputs, +// - need a hook to assemble output objects +// - Rename generate_input() to initialize_input() +// - Add initialize_output() +// - Run the various def assemblies through tests of: +// - Different input/output combinations (e.g. int -> float, etc) +// - TestScanMixedTypes +// - Validate intermediate types, casting order, etc +// - With various initial value specification types, too. +// - Custom binary_operator +// - Should just an additional entry in postfix_args +// - In-place execution + +// From testing/scan.cu: +// - TestInclusiveScanDispatchExplicit +// - TestInclusiveScanDispatchImplicit +// - TestScanWithDiscardIterator +// - TestScanWithLargeTypes +// - TestInclusiveScanWithBigIndexes +// - TestInclusiveScanWithUserDefinedType + +// From testing/async_reduce.cu: +// - test_async_reduce +// - test_async_reduce_counting_iterator +// - test_async_reduce_counting_iterator +// - test_async_reduce_using (wtf sometimes I hate c++ so much) +// - test_async_reduce_after (can this be worked into the framework?) +// - test_async_reduce_on_then_after (can this be worked into the framework?) +// - all the child variants (e.g. with allocator) too +// - test_async_reduce_caching +// - test_async_copy_then_reduce + +// template +// struct async_exclusive_scan_def +//{ +// public: +// // Input and output types for the algorithms: +// using input_type = thrust::device_vector; +// using output_type = thrust::device_vector; +// +// using postfix_args_type = std::tuple< // List any extra arg overloads: +// std::tuple<>, // - no extra args +// std::tuple, // - initial_value +// std::tuple> // - initial_value, binary_op +// >; +// +// // Create instances of the extra arguments to use when invoking the +// // algorithm: +// static postfix_args_type generate_postfix_args() +// { +// return { +// {}, // no extra args +// {42}, // initial_value +// {57, thrust::maximum<>{}} // initial_value, binary_op +// }; +// } +// +// // Generate an instance of the input: +// static input_type generate_input() +// { +// input_type input(1024); +// thrust::sequence(input.begin(), input.end(), 25, 3); +// return input; +// } +// +// // Generate an instance of the input: +// // Might be more complicated, eg. fancy iterators, etc +// static output_type generate_output() { return output_type{}; } +// +// // Invoke a reference implementation for a single overload as described by +// // postfix_tuple. This tuple contains instances of any additional arguments +// // to pass to the algorithm. The tuple/index_sequence pattern is used to +// // support the "no extra args" overload, since the parameter pack expansion +// // will do exactly what we want in all cases. +// template +// static void invoke_reference(input_type const &input, +// output_type &output, +// PostfixArgTuple &&postfix_tuple, +// std::index_sequence) +// { +// // Create host versions of the input/output: +// thrust::host_vector host_input(input); +// thrust::host_vector host_output(input.size()); +// +// // Run host synchronous algorithm to generate reference. +// thrust::exclusive_scan(host_input.cbegin(), +// host_input.cend(), +// host_output.begin(), +// std::get( +// THRUST_FWD(postfix_tuple))...); +// +// // Copy back to device. +// output = host_output; +// } +// +// // Invoke the async algorithm for a single overload as described by +// // the prefix and postfix tuples. These tuples contains instances of any +// // additional arguments to pass to the algorithm. The tuple/index_sequence +// // pattern is used to support the "no extra args" overload, since the +// // parameter pack expansion will do exactly what we want in all cases. +// // Prefix args are included here (but not for invoke_reference) to allow the +// // test framework to change the execution policy. +// // This method must return an event or future. +// template +// static auto invoke_async(PrefixArgTuple &&prefix_tuple, +// std::index_sequence, +// input_type const &input, +// output_type &output, +// PostfixArgTuple &&postfix_tuple, +// std::index_sequence) +// { +// output.resize(input.size()); +// auto e = thrust::async::exclusive_scan( +// std::get(THRUST_FWD(prefix_tuple))..., +// input.cbegin(), +// input.cend(), +// output.begin(), +// std::get(THRUST_FWD(postfix_tuple))...); +// return e; +// } +// +// // Wait on and validate the event/future (usually with TEST_EVENT_WAIT / +// // TEST_FUTURE_VALUE_RETRIEVAL), then check that the reference output +// matches +// // the testing output. +// template +// static void compare_outputs(EventType &e, +// output_type const &ref, +// output_type const &test) +// { +// TEST_EVENT_WAIT(e); +// ASSERT_EQUAL_QUIET(ref, test); +// } +//}; + +// Trying to find a sensible decomposition of the above into mixins that +// can be reused / worked into the framework. + +namespace mixin +{ + +namespace input +{ + +// TODO it'd be nice to specify a lambda expression to replace the call to +// thrust::sequence when needed. +template +struct device_vector +{ + using input_type = thrust::device_vector; + + static input_type generate_input() + { + input_type input(num_values); + thrust::sequence(input.begin(), + input.end(), + // fractional values are chosen for test ScanMixedTypes: + static_cast(1.5), + static_cast(1)); + return input; + } +}; + +} // namespace input + +namespace output +{ + +template +struct device_vector +{ + using output_type = thrust::device_vector; + + static output_type generate_output() { return output_type(num_values); } +}; + +} // namespace output + +namespace postfix_args +{ + +template > +struct all_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<>, // - no extra args + std::tuple, // - initial_value + std::tuple // - initial_value, binary_op + >; + + static postfix_args_type generate_postfix_args() + { + return {{}, {42}, {42, alternate_binary_op{}}}; + } +}; + +// Used by ScanMixedTypes. A fractional value is used to ensure that a different +// result is obtained when using float vs. int. +template +struct exclusive_scan_mixed_types_overloads +{ + using postfix_args_type = std::tuple< // Overloads to test: + std::tuple<>, // - no extra args + std::tuple, // - initial_value + std::tuple>, // - initial_value, plus<> + std::tuple>, // - initial_value, plus + std::tuple> // - initial_value, plus + >; + + static postfix_args_type generate_postfix_args() + { + return {{}, + {static_cast(5.5)}, + {static_cast(5.5), thrust::plus<>{}}, + {static_cast(5.5), thrust::plus{}}, + {static_cast(5.5), thrust::plus{}}}; + } +}; + +} // namespace postfix_args + +namespace invoke_reference +{ + +template +struct host_synchronous +{ + template + static void invoke_reference(InputType const &input, + OutputType &output, + PostfixArgTuple &&postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input); + thrust::host_vector host_output(input.size()); + + // Run host synchronous algorithm to generate reference. + thrust::exclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get( + THRUST_FWD(postfix_tuple))...); + + // Copy back to device. + output = host_output; + } +}; + +} // namespace invoke_reference + +namespace invoke_async +{ + +struct basic +{ + template + static auto invoke_async(PrefixArgTuple &&prefix_tuple, + std::index_sequence, + InputType const &input, + OutputType &output, + PostfixArgTuple &&postfix_tuple, + std::index_sequence) + { + auto e = thrust::async::exclusive_scan( + std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +namespace compare_outputs +{ + +struct assert_equal_quiet +{ + template + static void compare_outputs(EventType &e, + OutputType const &ref, + OutputType const &test) + { + TEST_EVENT_WAIT(e); + ASSERT_EQUAL(ref, test); + } +}; + +} // namespace compare_outputs + +} // namespace mixin + +template > +struct basic_invoker + : mixin::input::device_vector + , mixin::output::device_vector + , mixin::postfix_args::all_overloads + , mixin::invoke_reference::host_synchronous + , mixin::invoke_async::basic + , mixin::compare_outputs::assert_equal_quiet +{ + static std::string description() + { + return "basic invocation with device vectors"; + } +}; + +template +struct TestBasic +{ + void operator()() const + { + unittest::test_async_policy_overloads>::run(); + } +}; +// TODO GENERIC_SIZED_UNITTEST: +DECLARE_GENERIC_UNITTEST_WITH_TYPES(TestBasic, NumericTypes); + +template +struct mixed_types_invoker + : mixin::input::device_vector + , mixin::output::device_vector + , mixin::postfix_args::exclusive_scan_mixed_types_overloads< + initial_value_type> + , mixin::invoke_reference::host_synchronous + , mixin::invoke_async::basic + , mixin::compare_outputs::assert_equal_quiet +{ + static std::string description() + { + return "mixed input/output/initial type tests"; + } +}; + +void TestScanMixedTypes() +{ + // Test using mixed int/float types for: + // - input_value_type | (int, float) + // - output_value_type | (int, float) + // - initial_value_type | (int, float, ) + // - thrust::plus T-type | (int, float, void>) + // + // The initial_value_type and thrust::plus types are covered by the + // mixin::postfix_args::scan_mixed_types_overloads component. + + // invoker template params are input_value_type, output_vt, initial_vt: + unittest::test_async_policy_overloads< + mixed_types_invoker>::run(); + unittest::test_async_policy_overloads< + mixed_types_invoker>::run(); + unittest::test_async_policy_overloads< + mixed_types_invoker>::run(); + unittest::test_async_policy_overloads< + mixed_types_invoker>::run(); + unittest::test_async_policy_overloads< + mixed_types_invoker>::run(); + unittest::test_async_policy_overloads< + mixed_types_invoker>::run(); + unittest::test_async_policy_overloads< + mixed_types_invoker>::run(); + // We all float down here + unittest::test_async_policy_overloads< + mixed_types_invoker>::run(); +} +DECLARE_UNITTEST(TestScanMixedTypes); + +#endif // C++14 diff --git a/testing/async_inclusive_scan.cu b/testing/async_inclusive_scan.cu new file mode 100644 index 0000000000..37786b2e99 --- /dev/null +++ b/testing/async_inclusive_scan.cu @@ -0,0 +1,108 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include + +#include + +#include +#include + +// TODO Finish implementing tests. Draw from other async algos, as well as +// the older scan tests. + +namespace +{ + +template +struct async_inclusive_scan_def +{ +public: + using input_type = thrust::device_vector; + using output_type = thrust::device_vector; + + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<>, // - no extra args + std::tuple> // - Non-default binary-op + >; + + static postfix_args_type generate_postfix_args() + { + return { + {}, // - no extra args + {thrust::maximum<>{}} // - non-default binary_op + }; + } + + static input_type generate_input() + { + input_type input(1024); + thrust::sequence(input.begin(), input.end(), 25, 3); + return input; + } + + template + static void invoke_reference(PostfixArgTuple &&postfix_tuple, + std::index_sequence, + input_type const &input, + output_type &output) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input); + thrust::host_vector host_output(input.size()); + + // Run host synchronous algorithm to generate reference. + thrust::inclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get( + THRUST_FWD(postfix_tuple))...); + + // Copy back to device. + output = host_output; + } + + template + static auto invoke_async(PrefixArgTuple &&prefix_tuple, + std::index_sequence, + PostfixArgTuple &&postfix_tuple, + std::index_sequence, + input_type const &input, + output_type &output) + { + output.resize(input.size()); + auto e = thrust::async::inclusive_scan( + std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } + + template + static void compare_outputs(EventType &e, + output_type const &ref, + output_type const &test) + { + TEST_EVENT_WAIT(e); + ASSERT_EQUAL_QUIET(ref, test); + } +}; + +} // namespace + +void TestPolicyOverloads() +{ + // Only ints are tested here because we just want to check that the policies + // are propagated correctly, so keep codegen to a minimum. + unittest::test_async_policy_overloads>::run(); +} +DECLARE_UNITTEST(TestPolicyOverloads); + +#endif // C++14 diff --git a/testing/unittest/util_async.h b/testing/unittest/util_async.h index 984cc61c6b..760457c089 100644 --- a/testing/unittest/util_async.h +++ b/testing/unittest/util_async.h @@ -3,28 +3,29 @@ #include #include -#if THRUST_CPP_DIALECT >= 2011 +#if THRUST_CPP_DIALECT >= 2014 #include +#include + #include -#define TEST_EVENT_WAIT(e) \ - ::unittest::test_event_wait(e, __FILE__, __LINE__) \ - /**/ +#include + +#define TEST_EVENT_WAIT(e) \ + ::unittest::test_event_wait(e, __FILE__, __LINE__) /**/ -#define TEST_FUTURE_VALUE_RETRIEVAL(f) \ - ::unittest::test_future_value_retrieval(f, __FILE__, __LINE__) \ - /**/ +#define TEST_FUTURE_VALUE_RETRIEVAL(f) \ + ::unittest::test_future_value_retrieval(f, __FILE__, __LINE__) /**/ namespace unittest { template -__host__ -void test_event_wait( - Event&& e, std::string const& filename = "unknown", int lineno = -1 -) +__host__ void test_event_wait(Event &&e, + std::string const &filename = "unknown", + int lineno = -1) { ASSERT_EQUAL_WITH_FILE_AND_LINE(true, e.valid_stream(), filename, lineno); @@ -36,10 +37,11 @@ void test_event_wait( } template -__host__ -auto test_future_value_retrieval( - Future&& f, std::string const& filename = "unknown", int lineno = -1 -) -> decltype(f.extract()) +__host__ auto test_future_value_retrieval(Future &&f, + std::string const &filename = "unknow" + "n", + int lineno = -1) + -> decltype(f.extract()) { ASSERT_EQUAL_WITH_FILE_AND_LINE(true, f.valid_stream(), filename, lineno); ASSERT_EQUAL_WITH_FILE_AND_LINE(true, f.valid_content(), filename, lineno); @@ -55,12 +57,11 @@ auto test_future_value_retrieval( auto const r2 = f.extract(); ASSERT_THROWS_EQUAL_WITH_FILE_AND_LINE( - auto x = f.extract(); - THRUST_UNUSED_VAR(x) - , thrust::event_error - , thrust::event_error(thrust::event_errc::no_content) - , filename, lineno - ); + auto x = f.extract(); THRUST_UNUSED_VAR(x), + thrust::event_error, + thrust::event_error(thrust::event_errc::no_content), + filename, + lineno); ASSERT_EQUAL_WITH_FILE_AND_LINE(false, f.ready(), filename, lineno); ASSERT_EQUAL_WITH_FILE_AND_LINE(false, f.valid_stream(), filename, lineno); @@ -71,7 +72,217 @@ auto test_future_value_retrieval( return r2; } -} // namespace unittest +// Tests that policies are handled correctly for all overloads of an async +// algorithm. +// +// Specifically, each overload is called with: +// 1) No policy +// 2) thrust::device +// 3) thrust::device(thrust::device_allocator) +// 4) thrust::device.on(stream) +// 5) thrust::device(thrust::device_allocator).on(stream) +// +// For each test, the returned event/future is tested to make sure it holds a +// reference to the proper stream. +// +// The AlgoDef type defines an async algorithm, its overloads, and abstracts its +// invocation. See the async_exclusive_scan.cu usage for a documented example of +// this interface. +template +struct test_async_policy_overloads +{ + using algo_def = AlgoDef; + using input_type = typename algo_def::input_type; + using output_type = typename algo_def::output_type; + using postfix_args_type = typename algo_def::postfix_args_type; + + // Main entry point; call this from a unit test function. + static void run() + { + // When a policy uses the default stream, the algorithm implementation + // should spawn a new stream in the returned event. This lambda validates + // this: + auto using_default_stream = [](auto &e) { + ASSERT_NOT_EQUAL(thrust::cuda_cub::default_stream(), + e.stream().native_handle()); + }; + + // Lambda that verifies non-default streams are passed through to the + // event/future: + thrust::system::cuda::detail::unique_stream test_stream{}; + auto using_test_stream = [&test_stream](auto &e) { + ASSERT_EQUAL(test_stream.native_handle(), e.stream().native_handle()); + }; + + // Test the different types of policies: + test_policy("(no policy)", std::make_tuple(), using_default_stream); + test_policy("thrust::device", + std::make_tuple(thrust::device), + using_default_stream); + test_policy("thrust::device(thrust::device_allocator{})", + std::make_tuple( + thrust::device(thrust::device_allocator{})), + using_default_stream); + test_policy("thrust::device.on(test_stream.get())", + std::make_tuple(thrust::device.on(test_stream.get())), + using_test_stream); + test_policy( + "thrust::device(thrust::device_allocator{}).on(test_stream.get())", + std::make_tuple( + thrust::device(thrust::device_allocator{}).on(test_stream.get())), + using_test_stream); + } + +private: + template + static void test_policy(std::string const &policy_desc, + PolicyTuple &&policy_tuple, + ValidateEvent &&validate) + { + constexpr std::size_t num_postfix_arg_sets = + std::tuple_size::value; + + // Test the current policy across all overloads of the algorithm: + iterate_postfix_args<0, num_postfix_arg_sets>{}( + policy_desc, + THRUST_FWD(policy_tuple), + algo_def::generate_postfix_args(), + THRUST_FWD(validate)); + } + + // Iterate through postfix arg sets, calling all overloads they define + // with the provided policy_tuple. + template + struct iterate_postfix_args + { + template + void operator()(std::string const &policy_desc, + PolicyTuple &&policy_tuple, + postfix_args_type &&postfix_args, + ValidateEvent &&validate) + { + try + { + test_configuration(policy_tuple, + std::get(std::move(postfix_args)), + validate); + } + catch (unittest::UnitTestException &exc) + { + // Append some identifying information to the exception: + std::string const overload_desc = unittest::demangle( + typeid(typename std::tuple_element::type) + .name()); + + std::string const input_desc = + unittest::demangle(typeid(input_type).name()); + std::string const output_desc = + unittest::demangle(typeid(output_type).name()); + + exc << "\n" + << " - testcase = " << algo_def::description() << "\n" + << " - policy = " << policy_desc << "\n" + << " - input_type = " << input_desc << "\n" + << " - output_type = " << output_desc<< "\n" + << " - tuple of trailing arguments = " << overload_desc; + throw; + } -#endif // THRUST_CPP_DIALECT >= 2011 + // Recurse + iterate_postfix_args{}( + policy_desc, + THRUST_FWD(policy_tuple), + std::move(postfix_args), + THRUST_FWD(validate)); + } + }; + + // Terminal specialization + template + struct iterate_postfix_args + { + template + void operator()(Ts &&...) + {} + }; + + // Actually invoke the algorithms with the supplied prefix/postfix args + // and do the validations: + template + static void test_configuration(PrefixArgTuple &&prefix_tuple_ref, + PostfixArgTuple &&postfix_tuple_ref, + ValidateEvent const &validate) + { + using prefix_tuple_type = thrust::remove_cvref_t; + using postfix_tuple_type = thrust::remove_cvref_t; + + // Sink these tuples into const locals so they can be safely passed to + // multiple invocations without worrying about potential modifications. + prefix_tuple_type const prefix_tuple = THRUST_FWD(prefix_tuple_ref); + postfix_tuple_type const postfix_tuple = THRUST_FWD(postfix_tuple_ref); + + // Generate index sequences for the tuples: + constexpr auto prefix_tuple_size = std::tuple_size{}; + constexpr auto postfix_tuple_size = std::tuple_size{}; + using prefix_index_seq = std::make_index_sequence; + using postfix_index_seq = std::make_index_sequence; + + input_type const input = algo_def::generate_input(); + + output_type test_output_a = algo_def::generate_output(); + output_type test_output_b = algo_def::generate_output(); + output_type test_output_c = algo_def::generate_output(); + output_type test_output_d = algo_def::generate_output(); + + // Invoke multiple overlapping async algorithms, capturing their outputs + // and events/futures: + auto e_a = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input, + test_output_a, + postfix_tuple, + postfix_index_seq{}); + auto e_b = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input, + test_output_b, + postfix_tuple, + postfix_index_seq{}); + auto e_c = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input, + test_output_c, + postfix_tuple, + postfix_index_seq{}); + auto e_d = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input, + test_output_d, + postfix_tuple, + postfix_index_seq{}); + + // Let reference calc overlap with async testing: + output_type ref_output = algo_def::generate_output(); + algo_def::invoke_reference(input, + ref_output, + postfix_tuple, + postfix_index_seq{}); + + algo_def::compare_outputs(e_a, ref_output, test_output_a); + algo_def::compare_outputs(e_b, ref_output, test_output_b); + algo_def::compare_outputs(e_c, ref_output, test_output_c); + algo_def::compare_outputs(e_d, ref_output, test_output_d); + + validate(e_a); + validate(e_b); + validate(e_c); + validate(e_d); + } +}; + +} // namespace unittest +#endif // THRUST_CPP_DIALECT >= 2014 diff --git a/thrust/async/scan.h b/thrust/async/scan.h new file mode 100644 index 0000000000..d7adfac983 --- /dev/null +++ b/thrust/async/scan.h @@ -0,0 +1,355 @@ +/* + * Copyright 2008-2020 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file async/scan.h + * \brief Functions for asynchronously computing prefix scans. + */ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include +#include + +#include + +#include +#include +#include + +#include + +namespace thrust +{ + +namespace async +{ + +// Fallback implementations used when no overloads are found via ADL: +namespace unimplemented +{ + +template +event +async_inclusive_scan(thrust::execution_policy&, + ForwardIt, + Sentinel, + OutputIt, + BinaryOp) +{ + THRUST_STATIC_ASSERT_MSG( + (thrust::detail::depend_on_instantiation::value), + "this algorithm is not implemented for the specified system" + ); + return {}; +} + +template +event +async_exclusive_scan(thrust::execution_policy&, + ForwardIt, + Sentinel, + OutputIt, + InitialValueType, + BinaryOp) +{ + THRUST_STATIC_ASSERT_MSG( + (thrust::detail::depend_on_instantiation::value), + "this algorithm is not implemented for the specified system" + ); + return {}; +} + +} // namespace unimplemented + +namespace inclusive_scan_detail +{ + +// Include fallback implementation for ADL failures +using thrust::async::unimplemented::async_inclusive_scan; + +// Implementation of the thrust::async::inclusive_scan CPO. +struct inclusive_scan_fn final +{ + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(op) + ) + ) + + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + thrust::plus<>{} + ) + ) + + template >>> + THRUST_NODISCARD + auto operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(op) + ) + ) + + template + THRUST_NODISCARD + auto operator()(ForwardIt&& first, Sentinel&& last, OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + thrust::plus<>{} + ) + ) +}; + +} // namespace inclusive_scan_detail + +THRUST_INLINE_CONSTANT inclusive_scan_detail::inclusive_scan_fn inclusive_scan{}; + +namespace exclusive_scan_detail +{ + +// Include fallback implementation for ADL failures +using thrust::async::unimplemented::async_exclusive_scan; + +// Implementation of the thrust::async::exclusive_scan CPO. +struct exclusive_scan_fn final +{ + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) + ) + + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + thrust::plus<>{} + ) + ) + + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + iterator_value_t>{}, + thrust::plus<>{} + ) + ) + + template >>> + THRUST_NODISCARD + auto + operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) + ) + + template >>> + THRUST_NODISCARD + auto + operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + thrust::plus<>{} + ) + ) + + template + THRUST_NODISCARD + auto operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + iterator_value_t>{}, + thrust::plus<>{} + ) + ) +}; + +} // namespace exclusive_scan_detail + +THRUST_INLINE_CONSTANT exclusive_scan_detail::exclusive_scan_fn exclusive_scan{}; + +} // namespace async + +} // end namespace thrust + +#endif diff --git a/thrust/system/cuda/detail/async/exclusive_scan.h b/thrust/system/cuda/detail/async/exclusive_scan.h new file mode 100644 index 0000000000..022b7177f4 --- /dev/null +++ b/thrust/system/cuda/detail/async/exclusive_scan.h @@ -0,0 +1,180 @@ +/****************************************************************************** + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +// TODO specialize for thrust::plus to use e.g. ExclusiveSum instead of ExcScan +// - Note that thrust::plus<> is transparent, cub::Sum is not. This should be +// fixed in CUB first). +// - Need to check if CUB actually optimizes for sums before putting in effort + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +auto async_exclusive_scan_n(execution_policy& policy, + ForwardIt first, + Size n, + OutputIt out, + InitialValueType init, + BinaryOp op) + -> unique_eager_event +{ + auto const device_alloc = get_async_device_allocator(policy); + unique_eager_event ev; + + // Determine temporary device storage requirements. + size_t tmp_size = 0; + thrust::cuda_cub::throw_on_error( + cub::DeviceScan::ExclusiveScan( + nullptr, + tmp_size, + first, + out, + op, + init, + n, + nullptr, // Null stream, just for sizing. + THRUST_DEBUG_SYNC_FLAG + ), + "after exclusive_scan sizing" + ); + + // Allocate temporary storage. + auto content = uninitialized_allocate_unique_n( + device_alloc, tmp_size + ); + void* const tmp_ptr = raw_pointer_cast(content.get()); + + // Set up stream with dependencies. + cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy); + + if (thrust::cuda_cub::default_stream() != user_raw_stream) + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple( + std::move(content), + unique_stream(nonowning, user_raw_stream) + ), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + else + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple(std::move(content)), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + + // Run scan. + thrust::cuda_cub::throw_on_error( + cub::DeviceScan::ExclusiveScan( + tmp_ptr, + tmp_size, + first, + out, + op, + init, + n, + ev.stream().native_handle(), + THRUST_DEBUG_SYNC_FLAG + ), + "after exclusive_scan launch" + ); + + return std::move(ev); +} + +}}} // namespace system::cuda::detail + +namespace cuda_cub +{ + +// ADL entry point. +template +auto async_exclusive_scan(execution_policy& policy, + ForwardIt first, + Sentinel&& last, + OutputIt&& out, + InitialValueType &&init, + BinaryOp&& op) +THRUST_RETURNS( + thrust::system::cuda::detail::async_exclusive_scan_n( + policy, + first, + distance(first, THRUST_FWD(last)), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) +) + +} // cuda_cub + +} // end namespace thrust + +#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#endif + diff --git a/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/system/cuda/detail/async/inclusive_scan.h new file mode 100644 index 0000000000..5f37c19cff --- /dev/null +++ b/thrust/system/cuda/detail/async/inclusive_scan.h @@ -0,0 +1,173 @@ +/****************************************************************************** + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +// TODO specialize for thrust::plus to use e.g. InclusiveSum instead of IncScan +// - Note that thrust::plus<> is transparent, cub::Sum is not. This should be +// fixed in CUB first). +// - Need to check if CUB actually optimizes for sums before putting in effort + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +auto async_inclusive_scan_n(execution_policy& policy, + ForwardIt first, + Size n, + OutputIt out, + BinaryOp op) + -> unique_eager_event +{ + auto const device_alloc = get_async_device_allocator(policy); + unique_eager_event ev; + + // Determine temporary device storage requirements. + size_t tmp_size = 0; + thrust::cuda_cub::throw_on_error( + cub::DeviceScan::InclusiveScan( + nullptr, + tmp_size, + first, + out, + op, + n, + nullptr, // Null stream, just for sizing. + THRUST_DEBUG_SYNC_FLAG + ), + "after inclusive_scan sizing" + ); + + // Allocate temporary storage. + auto content = uninitialized_allocate_unique_n( + device_alloc, tmp_size + ); + void* const tmp_ptr = raw_pointer_cast(content.get()); + + // Set up stream with dependencies. + cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy); + + if (thrust::cuda_cub::default_stream() != user_raw_stream) + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple( + std::move(content), + unique_stream(nonowning, user_raw_stream) + ), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + else + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple(std::move(content)), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + + // Run scan. + thrust::cuda_cub::throw_on_error( + cub::DeviceScan::InclusiveScan( + tmp_ptr, + tmp_size, + first, + out, + op, + n, + ev.stream().native_handle(), + THRUST_DEBUG_SYNC_FLAG + ), + "after inclusive_scan launch" + ); + + return std::move(ev); +} + +}}} // namespace system::cuda::detail + +namespace cuda_cub +{ + +// ADL entry point. +template +auto async_inclusive_scan(execution_policy& policy, + ForwardIt first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) +THRUST_RETURNS( + thrust::system::cuda::detail::async_inclusive_scan_n( + policy, + first, + distance(first, THRUST_FWD(last)), + THRUST_FWD(out), + THRUST_FWD(op) + ) +) + +} // cuda_cub + +} // end namespace thrust + +#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#endif + diff --git a/thrust/system/cuda/detail/async/scan.h b/thrust/system/cuda/detail/async/scan.h new file mode 100644 index 0000000000..7d993e6641 --- /dev/null +++ b/thrust/system/cuda/detail/async/scan.h @@ -0,0 +1,33 @@ +/****************************************************************************** + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include + +#include +#include diff --git a/thrust/system/detail/adl/async/scan.h b/thrust/system/detail/adl/async/scan.h new file mode 100644 index 0000000000..a2a90618b4 --- /dev/null +++ b/thrust/system/detail/adl/async/scan.h @@ -0,0 +1,34 @@ +/* + * Copyright 2008-2020 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The purpose of this header is to #include the async/scan.h header of the +// sequential, host, and device systems. It should be #included in any code +// which uses ADL to dispatch async scans. + +#pragma once + +#include + +//#include + +//#define __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/scan.h> +//#include __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER +//#undef __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER + +#define __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/scan.h> +#include __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER +#undef __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER + diff --git a/thrust/system/detail/generic/scan.inl b/thrust/system/detail/generic/scan.inl index 300b697b26..83d272c3e9 100644 --- a/thrust/system/detail/generic/scan.inl +++ b/thrust/system/detail/generic/scan.inl @@ -61,9 +61,7 @@ __host__ __device__ { // Use the input iterator's value type per https://wg21.link/P0571 using ValueType = typename thrust::iterator_value::type; - - // assume 0 as the initialization value - return thrust::exclusive_scan(exec, first, last, result, ValueType(0)); + return thrust::exclusive_scan(exec, first, last, result, ValueType{}); } // end exclusive_scan()