From 87ae049378eed3120c19924bcc3f2cec2fc1d90f Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Fri, 31 Jul 2020 17:22:32 -0400
Subject: [PATCH] WIP Adding async scan algorithms.

---
 dependencies/cub                              |   2 +-
 testing/CMakeLists.txt                        |   2 +
 testing/async_exclusive_scan.cu               | 424 ++++++++++++++++++
 testing/async_inclusive_scan.cu               | 108 +++++
 testing/unittest/util_async.h                 | 257 ++++++++++-
 thrust/async/scan.h                           | 355 +++++++++++++++
 .../system/cuda/detail/async/exclusive_scan.h | 180 ++++++++
 .../system/cuda/detail/async/inclusive_scan.h | 173 +++++++
 thrust/system/cuda/detail/async/scan.h        |  33 ++
 thrust/system/detail/adl/async/scan.h         |  34 ++
 thrust/system/detail/generic/scan.inl         |   4 +-
 11 files changed, 1545 insertions(+), 27 deletions(-)
 create mode 100644 testing/async_exclusive_scan.cu
 create mode 100644 testing/async_inclusive_scan.cu
 create mode 100644 thrust/async/scan.h
 create mode 100644 thrust/system/cuda/detail/async/exclusive_scan.h
 create mode 100644 thrust/system/cuda/detail/async/inclusive_scan.h
 create mode 100644 thrust/system/cuda/detail/async/scan.h
 create mode 100644 thrust/system/detail/adl/async/scan.h
diff --git a/dependencies/cub b/dependencies/cub
index 2749cb0c7b..99ac8fe71e 160000
--- a/dependencies/cub
+++ b/dependencies/cub
@@ -1 +1 @@
-Subproject commit 2749cb0c7bc5a72c806d7ca0b8e4d702dbe017e5
+Subproject commit 99ac8fe71e937d57c594c38fe2d3ee9fd99e1a1b
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
index fdfc04e97b..ca6ed63247 100644
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -20,9 +20,11 @@ add_subdirectory(unittest)
 # List of tests that aren't implemented for all backends, but are implemented for CUDA.
 set(partially_implemented_CUDA
   async_copy
+  async_exclusive_scan
   async_for_each
   async_reduce
   async_reduce_into
+  async_inclusive_scan
   async_sort
   async_transform
   event
diff --git a/testing/async_exclusive_scan.cu b/testing/async_exclusive_scan.cu
new file mode 100644
index 0000000000..47b1d6d168
--- /dev/null
+++ b/testing/async_exclusive_scan.cu
@@ -0,0 +1,424 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <thrust/async/scan.h>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+#include <sstream>
+#include <string>
+
+// TODO Finish implementing tests. Draw from other async algos, as well as
+// the older scan tests.
+
+// X Add a description string method to AlgoDef and add it to any thrown
+//   exceptions
+// - Change the num_values NTTP to a runtime variable for better integration
+//   with the existing framework.
+// - For a large number of values, the test will fail for floating point numbers
+//   because we're comparing the cpp and cuda/async backends and fp adds are
+//   non-associative.
+//   - CUB's scans guarantee deterministic results, but only when on the exact
+//     same device.
+//   - Don't want to test against synch cuda backend, that'll just compare
+//     cub::DeviceScan to cub::DeviceScan.
+//   - Quick test using fuzzy compare for FP types had issues once
+//     custom_numeric was shown to an ASSERT_ALMOST_EQUAL.
+//   - Should probably fix ASSERT_ALMOST_EQUAL to work with custom_numeric.
+
+// - Build def using mixins for e.g. counting_iterator input, discard outputs,
+//   - need a hook to assemble output objects
+//     - Rename generate_input() to initialize_input()
+//     - Add initialize_output()
+// - Run the various def assemblies through tests of:
+//   - Different input/output combinations (e.g. int -> float, etc)
+//     - TestScanMixedTypes
+//     - Validate intermediate types, casting order, etc
+//     - With various initial value specification types, too.
+//   - Custom binary_operator
+//     - Should just an additional entry in postfix_args
+//   - In-place execution
+
+// From testing/scan.cu:
+//   - TestInclusiveScanDispatchExplicit
+//   - TestInclusiveScanDispatchImplicit
+//   - TestScanWithDiscardIterator
+//   - TestScanWithLargeTypes
+//   - TestInclusiveScanWithBigIndexes
+//   - TestInclusiveScanWithUserDefinedType
+
+// From testing/async_reduce.cu:
+//   - test_async_reduce
+//   - test_async_reduce_counting_iterator
+//   - test_async_reduce_counting_iterator
+//   - test_async_reduce_using (wtf sometimes I hate c++ so much)
+//   - test_async_reduce_after (can this be worked into the framework?)
+//   - test_async_reduce_on_then_after (can this be worked into the framework?)
+//     - all the child variants (e.g. with allocator) too
+//   - test_async_reduce_caching
+//   - test_async_copy_then_reduce
+
+// template <typename value_type>
+// struct async_exclusive_scan_def
+//{
+// public:
+//  // Input and output types for the algorithms:
+//  using input_type  = thrust::device_vector<value_type>;
+//  using output_type = thrust::device_vector<value_type>;
+//
+//  using postfix_args_type = std::tuple<       // List any extra arg overloads:
+//    std::tuple<>,                             // - no extra args
+//    std::tuple<value_type>,                   // - initial_value
+//    std::tuple<value_type, thrust::maximum<>> // - initial_value, binary_op
+//    >;
+//
+//  // Create instances of the extra arguments to use when invoking the
+//  // algorithm:
+//  static postfix_args_type generate_postfix_args()
+//  {
+//    return {
+//      {},                       // no extra args
+//      {42},                     // initial_value
+//      {57, thrust::maximum<>{}} // initial_value, binary_op
+//    };
+//  }
+//
+//  // Generate an instance of the input:
+//  static input_type generate_input()
+//  {
+//    input_type input(1024);
+//    thrust::sequence(input.begin(), input.end(), 25, 3);
+//    return input;
+//  }
+//
+//  // Generate an instance of the input:
+//  // Might be more complicated, eg. fancy iterators, etc
+//  static output_type generate_output() { return output_type{}; }
+//
+//  // Invoke a reference implementation for a single overload as described by
+//  // postfix_tuple. This tuple contains instances of any additional arguments
+//  // to pass to the algorithm. The tuple/index_sequence pattern is used to
+//  // support the "no extra args" overload, since the parameter pack expansion
+//  // will do exactly what we want in all cases.
+//  template <typename PostfixArgTuple, std::size_t... PostfixArgIndices>
+//  static void invoke_reference(input_type const &input,
+//                               output_type &output,
+//                               PostfixArgTuple &&postfix_tuple,
+//                               std::index_sequence<PostfixArgIndices...>)
+//  {
+//    // Create host versions of the input/output:
+//    thrust::host_vector<value_type> host_input(input);
+//    thrust::host_vector<value_type> host_output(input.size());
+//
+//    // Run host synchronous algorithm to generate reference.
+//    thrust::exclusive_scan(host_input.cbegin(),
+//                           host_input.cend(),
+//                           host_output.begin(),
+//                           std::get<PostfixArgIndices>(
+//                             THRUST_FWD(postfix_tuple))...);
+//
+//    // Copy back to device.
+//    output = host_output;
+//  }
+//
+//  // Invoke the async algorithm for a single overload as described by
+//  // the prefix and postfix tuples. These tuples contains instances of any
+//  // additional arguments to pass to the algorithm. The tuple/index_sequence
+//  // pattern is used to support the "no extra args" overload, since the
+//  // parameter pack expansion will do exactly what we want in all cases.
+//  // Prefix args are included here (but not for invoke_reference) to allow the
+//  // test framework to change the execution policy.
+//  // This method must return an event or future.
+//  template <typename PrefixArgTuple,
+//            std::size_t... PrefixArgIndices,
+//            typename PostfixArgTuple,
+//            std::size_t... PostfixArgIndices>
+//  static auto invoke_async(PrefixArgTuple &&prefix_tuple,
+//                           std::index_sequence<PrefixArgIndices...>,
+//                           input_type const &input,
+//                           output_type &output,
+//                           PostfixArgTuple &&postfix_tuple,
+//                           std::index_sequence<PostfixArgIndices...>)
+//  {
+//    output.resize(input.size());
+//    auto e = thrust::async::exclusive_scan(
+//      std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+//      input.cbegin(),
+//      input.cend(),
+//      output.begin(),
+//      std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+//    return e;
+//  }
+//
+//  // Wait on and validate the event/future (usually with TEST_EVENT_WAIT /
+//  // TEST_FUTURE_VALUE_RETRIEVAL), then check that the reference output
+//  matches
+//  // the testing output.
+//  template <typename EventType>
+//  static void compare_outputs(EventType &e,
+//                              output_type const &ref,
+//                              output_type const &test)
+//  {
+//    TEST_EVENT_WAIT(e);
+//    ASSERT_EQUAL_QUIET(ref, test);
+//  }
+//};
+
+// Trying to find a sensible decomposition of the above into mixins that
+// can be reused / worked into the framework.
+
+namespace mixin
+{
+
+namespace input
+{
+
+// TODO it'd be nice to specify a lambda expression to replace the call to
+// thrust::sequence when needed.
+template <typename value_type, size_t num_values = 1024 * 1024>
+struct device_vector
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input()
+  {
+    input_type input(num_values);
+    thrust::sequence(input.begin(),
+                     input.end(),
+                     // fractional values are chosen for test ScanMixedTypes:
+                     static_cast<value_type>(1.5),
+                     static_cast<value_type>(1));
+    return input;
+  }
+};
+
+} // namespace input
+
+namespace output
+{
+
+template <typename value_type, size_t num_values = 1024 * 1024>
+struct device_vector
+{
+  using output_type = thrust::device_vector<value_type>;
+
+  static output_type generate_output() { return output_type(num_values); }
+};
+
+} // namespace output
+
+namespace postfix_args
+{
+
+template <typename value_type, typename alternate_binary_op = thrust::maximum<>>
+struct all_overloads
+{
+  using postfix_args_type = std::tuple<         // List any extra arg overloads:
+    std::tuple<>,                               // - no extra args
+    std::tuple<value_type>,                     // - initial_value
+    std::tuple<value_type, alternate_binary_op> // - initial_value, binary_op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return {{}, {42}, {42, alternate_binary_op{}}};
+  }
+};
+
+// Used by ScanMixedTypes. A fractional value is used to ensure that a different
+// result is obtained when using float vs. int.
+template <typename value_type>
+struct exclusive_scan_mixed_types_overloads
+{
+  using postfix_args_type = std::tuple<         // Overloads to test:
+    std::tuple<>,                               // - no extra args
+    std::tuple<value_type>,                     // - initial_value
+    std::tuple<value_type, thrust::plus<>>,     // - initial_value, plus<>
+    std::tuple<value_type, thrust::plus<int>>,  // - initial_value, plus<int>
+    std::tuple<value_type, thrust::plus<float>> // - initial_value, plus<float>
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return {{},
+            {static_cast<value_type>(5.5)},
+            {static_cast<value_type>(5.5), thrust::plus<>{}},
+            {static_cast<value_type>(5.5), thrust::plus<int>{}},
+            {static_cast<value_type>(5.5), thrust::plus<float>{}}};
+  }
+};
+
+} // namespace postfix_args
+
+namespace invoke_reference
+{
+
+template <typename input_value_type,
+          typename output_value_type = input_value_type>
+struct host_synchronous
+{
+  template <typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static void invoke_reference(InputType const &input,
+                               OutputType &output,
+                               PostfixArgTuple &&postfix_tuple,
+                               std::index_sequence<PostfixArgIndices...>)
+  {
+    // Create host versions of the input/output:
+    thrust::host_vector<input_value_type> host_input(input);
+    thrust::host_vector<output_value_type> host_output(input.size());
+
+    // Run host synchronous algorithm to generate reference.
+    thrust::exclusive_scan(host_input.cbegin(),
+                           host_input.cend(),
+                           host_output.begin(),
+                           std::get<PostfixArgIndices>(
+                             THRUST_FWD(postfix_tuple))...);
+
+    // Copy back to device.
+    output = host_output;
+  }
+};
+
+} // namespace invoke_reference
+
+namespace invoke_async
+{
+
+struct basic
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple &&prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const &input,
+                           OutputType &output,
+                           PostfixArgTuple &&postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    auto e = thrust::async::exclusive_scan(
+      std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+      input.cbegin(),
+      input.cend(),
+      output.begin(),
+      std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+} // namespace invoke_async
+
+namespace compare_outputs
+{
+
+struct assert_equal_quiet
+{
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType &e,
+                              OutputType const &ref,
+                              OutputType const &test)
+  {
+    TEST_EVENT_WAIT(e);
+    ASSERT_EQUAL(ref, test);
+  }
+};
+
+} // namespace compare_outputs
+
+} // namespace mixin
+
+template <typename input_value_type,
+          typename output_value_type      = input_value_type,
+          typename initial_value_type     = input_value_type,
+          typename std::size_t num_values = 1024,
+          typename alternate_binary_op    = thrust::maximum<>>
+struct basic_invoker
+    : mixin::input::device_vector<input_value_type, num_values>
+    , mixin::output::device_vector<output_value_type, num_values>
+    , mixin::postfix_args::all_overloads<initial_value_type, alternate_binary_op>
+    , mixin::invoke_reference::host_synchronous<input_value_type,
+                                                output_value_type>
+    , mixin::invoke_async::basic
+    , mixin::compare_outputs::assert_equal_quiet
+{
+  static std::string description()
+  {
+    return "basic invocation with device vectors";
+  }
+};
+
+template <typename T>
+struct TestBasic
+{
+  void operator()() const
+  {
+    unittest::test_async_policy_overloads<basic_invoker<T>>::run();
+  }
+};
+// TODO GENERIC_SIZED_UNITTEST:
+DECLARE_GENERIC_UNITTEST_WITH_TYPES(TestBasic, NumericTypes);
+
+template <typename input_value_type,
+          typename output_value_type,
+          typename initial_value_type,
+          std::size_t num_values = 10>
+struct mixed_types_invoker
+    : mixin::input::device_vector<input_value_type, num_values>
+    , mixin::output::device_vector<output_value_type, num_values>
+    , mixin::postfix_args::exclusive_scan_mixed_types_overloads<
+        initial_value_type>
+    , mixin::invoke_reference::host_synchronous<input_value_type,
+                                                output_value_type>
+    , mixin::invoke_async::basic
+    , mixin::compare_outputs::assert_equal_quiet
+{
+  static std::string description()
+  {
+    return "mixed input/output/initial type tests";
+  }
+};
+
+void TestScanMixedTypes()
+{
+  // Test using mixed int/float types for:
+  // - input_value_type       | (int, float)
+  // - output_value_type      | (int, float)
+  // - initial_value_type     | (int, float, <none>)
+  // - thrust::plus<T> T-type | (int, float, void>)
+  //
+  // The initial_value_type and thrust::plus<T> types are covered by the
+  // mixin::postfix_args::scan_mixed_types_overloads component.
+
+  // invoker template params are input_value_type, output_vt, initial_vt:
+  unittest::test_async_policy_overloads<
+    mixed_types_invoker<int, int, int>>::run();
+  unittest::test_async_policy_overloads<
+    mixed_types_invoker<int, int, float>>::run();
+  unittest::test_async_policy_overloads<
+    mixed_types_invoker<int, float, int>>::run();
+  unittest::test_async_policy_overloads<
+    mixed_types_invoker<int, float, float>>::run();
+  unittest::test_async_policy_overloads<
+    mixed_types_invoker<float, int, int>>::run();
+  unittest::test_async_policy_overloads<
+    mixed_types_invoker<float, int, float>>::run();
+  unittest::test_async_policy_overloads<
+    mixed_types_invoker<float, float, int>>::run();
+  // We all float down here
+  unittest::test_async_policy_overloads<
+    mixed_types_invoker<float, float, float>>::run();
+}
+DECLARE_UNITTEST(TestScanMixedTypes);
+
+#endif // C++14
diff --git a/testing/async_inclusive_scan.cu b/testing/async_inclusive_scan.cu
new file mode 100644
index 0000000000..37786b2e99
--- /dev/null
+++ b/testing/async_inclusive_scan.cu
@@ -0,0 +1,108 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <thrust/async/scan.h>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+// TODO Finish implementing tests. Draw from other async algos, as well as
+// the older scan tests.
+
+namespace
+{
+
+template <typename value_type>
+struct async_inclusive_scan_def
+{
+public:
+  using input_type  = thrust::device_vector<value_type>;
+  using output_type = thrust::device_vector<value_type>;
+
+  using postfix_args_type = std::tuple< // List any extra arg overloads:
+    std::tuple<>,                       // - no extra args
+    std::tuple<thrust::maximum<>>       // - Non-default binary-op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return {
+      {},                   // - no extra args
+      {thrust::maximum<>{}} // - non-default binary_op
+    };
+  }
+
+  static input_type generate_input()
+  {
+    input_type input(1024);
+    thrust::sequence(input.begin(), input.end(), 25, 3);
+    return input;
+  }
+
+  template <typename PostfixArgTuple, std::size_t... PostfixArgIndices>
+  static void invoke_reference(PostfixArgTuple &&postfix_tuple,
+                               std::index_sequence<PostfixArgIndices...>,
+                               input_type const &input,
+                               output_type &output)
+  {
+    // Create host versions of the input/output:
+    thrust::host_vector<value_type> host_input(input);
+    thrust::host_vector<value_type> host_output(input.size());
+
+    // Run host synchronous algorithm to generate reference.
+    thrust::inclusive_scan(host_input.cbegin(),
+                           host_input.cend(),
+                           host_output.begin(),
+                           std::get<PostfixArgIndices>(
+                             THRUST_FWD(postfix_tuple))...);
+
+    // Copy back to device.
+    output = host_output;
+  }
+
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple &&prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           PostfixArgTuple &&postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>,
+                           input_type const &input,
+                           output_type &output)
+  {
+    output.resize(input.size());
+    auto e = thrust::async::inclusive_scan(
+      std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+      input.cbegin(),
+      input.cend(),
+      output.begin(),
+      std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+
+  template <typename EventType>
+  static void compare_outputs(EventType &e,
+                              output_type const &ref,
+                              output_type const &test)
+  {
+    TEST_EVENT_WAIT(e);
+    ASSERT_EQUAL_QUIET(ref, test);
+  }
+};
+
+} // namespace
+
+void TestPolicyOverloads()
+{
+  // Only ints are tested here because we just want to check that the policies
+  // are propagated correctly, so keep codegen to a minimum.
+  unittest::test_async_policy_overloads<async_inclusive_scan_def<int>>::run();
+}
+DECLARE_UNITTEST(TestPolicyOverloads);
+
+#endif // C++14
diff --git a/testing/unittest/util_async.h b/testing/unittest/util_async.h
index 984cc61c6b..760457c089 100644
--- a/testing/unittest/util_async.h
+++ b/testing/unittest/util_async.h
@@ -3,28 +3,29 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/cpp11_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <unittest/unittest.h>
 
+#include <thrust/device_allocator.h>
+
 #include <thrust/future.h>
 
-#define TEST_EVENT_WAIT(e)                                                    \
-  ::unittest::test_event_wait(e, __FILE__, __LINE__)                          \
-  /**/
+#include <string>
+
+#define TEST_EVENT_WAIT(e)                                                     \
+  ::unittest::test_event_wait(e, __FILE__, __LINE__) /**/
 
-#define TEST_FUTURE_VALUE_RETRIEVAL(f)                                        \
-  ::unittest::test_future_value_retrieval(f, __FILE__, __LINE__)              \
-  /**/
+#define TEST_FUTURE_VALUE_RETRIEVAL(f)                                         \
+  ::unittest::test_future_value_retrieval(f, __FILE__, __LINE__) /**/
 
 namespace unittest
 {
 
 template <typename Event>
-__host__
-void test_event_wait(
-  Event&& e, std::string const& filename = "unknown", int lineno = -1
-)
+__host__ void test_event_wait(Event &&e,
+                              std::string const &filename = "unknown",
+                              int lineno                  = -1)
 {
   ASSERT_EQUAL_WITH_FILE_AND_LINE(true, e.valid_stream(), filename, lineno);
 
@@ -36,10 +37,11 @@ void test_event_wait(
 }
 
 template <typename Future>
-__host__
-auto test_future_value_retrieval(
-  Future&& f, std::string const& filename = "unknown", int lineno = -1
-) -> decltype(f.extract())
+__host__ auto test_future_value_retrieval(Future &&f,
+                                          std::string const &filename = "unknow"
+                                                                        "n",
+                                          int lineno = -1)
+  -> decltype(f.extract())
 {
   ASSERT_EQUAL_WITH_FILE_AND_LINE(true, f.valid_stream(), filename, lineno);
   ASSERT_EQUAL_WITH_FILE_AND_LINE(true, f.valid_content(), filename, lineno);
@@ -55,12 +57,11 @@ auto test_future_value_retrieval(
   auto const r2 = f.extract();
 
   ASSERT_THROWS_EQUAL_WITH_FILE_AND_LINE(
-    auto x = f.extract();
-    THRUST_UNUSED_VAR(x)
-  , thrust::event_error
-  , thrust::event_error(thrust::event_errc::no_content)
-  , filename, lineno
-  );
+    auto x = f.extract(); THRUST_UNUSED_VAR(x),
+                          thrust::event_error,
+                          thrust::event_error(thrust::event_errc::no_content),
+                          filename,
+                          lineno);
 
   ASSERT_EQUAL_WITH_FILE_AND_LINE(false, f.ready(), filename, lineno);
   ASSERT_EQUAL_WITH_FILE_AND_LINE(false, f.valid_stream(), filename, lineno);
@@ -71,7 +72,217 @@ auto test_future_value_retrieval(
   return r2;
 }
 
-} // namespace unittest
+// Tests that policies are handled correctly for all overloads of an async
+// algorithm.
+//
+// Specifically, each overload is called with:
+// 1) No policy
+// 2) thrust::device
+// 3) thrust::device(thrust::device_allocator<void>)
+// 4) thrust::device.on(stream)
+// 5) thrust::device(thrust::device_allocator<void>).on(stream)
+//
+// For each test, the returned event/future is tested to make sure it holds a
+// reference to the proper stream.
+//
+// The AlgoDef type defines an async algorithm, its overloads, and abstracts its
+// invocation. See the async_exclusive_scan.cu usage for a documented example of
+// this interface.
+template <typename AlgoDef>
+struct test_async_policy_overloads
+{
+  using algo_def          = AlgoDef;
+  using input_type        = typename algo_def::input_type;
+  using output_type       = typename algo_def::output_type;
+  using postfix_args_type = typename algo_def::postfix_args_type;
+
+  // Main entry point; call this from a unit test function.
+  static void run()
+  {
+    // When a policy uses the default stream, the algorithm implementation
+    // should spawn a new stream in the returned event. This lambda validates
+    // this:
+    auto using_default_stream = [](auto &e) {
+      ASSERT_NOT_EQUAL(thrust::cuda_cub::default_stream(),
+                       e.stream().native_handle());
+    };
+
+    // Lambda that verifies non-default streams are passed through to the
+    // event/future:
+    thrust::system::cuda::detail::unique_stream test_stream{};
+    auto using_test_stream = [&test_stream](auto &e) {
+      ASSERT_EQUAL(test_stream.native_handle(), e.stream().native_handle());
+    };
+
+    // Test the different types of policies:
+    test_policy("(no policy)", std::make_tuple(), using_default_stream);
+    test_policy("thrust::device",
+                std::make_tuple(thrust::device),
+                using_default_stream);
+    test_policy("thrust::device(thrust::device_allocator<void>{})",
+                std::make_tuple(
+                  thrust::device(thrust::device_allocator<void>{})),
+                using_default_stream);
+    test_policy("thrust::device.on(test_stream.get())",
+                std::make_tuple(thrust::device.on(test_stream.get())),
+                using_test_stream);
+    test_policy(
+      "thrust::device(thrust::device_allocator<void>{}).on(test_stream.get())",
+      std::make_tuple(
+        thrust::device(thrust::device_allocator<void>{}).on(test_stream.get())),
+      using_test_stream);
+  }
+
+private:
+  template <typename PolicyTuple, typename ValidateEvent>
+  static void test_policy(std::string const &policy_desc,
+                          PolicyTuple &&policy_tuple,
+                          ValidateEvent &&validate)
+  {
+    constexpr std::size_t num_postfix_arg_sets =
+      std::tuple_size<postfix_args_type>::value;
+
+    // Test the current policy across all overloads of the algorithm:
+    iterate_postfix_args<0, num_postfix_arg_sets>{}(
+      policy_desc,
+      THRUST_FWD(policy_tuple),
+      algo_def::generate_postfix_args(),
+      THRUST_FWD(validate));
+  }
+
+  // Iterate through postfix arg sets, calling all overloads they define
+  // with the provided policy_tuple.
+  template <size_t PostfixArgSetIdx, size_t NumPostfixArgSets>
+  struct iterate_postfix_args
+  {
+    template <typename PolicyTuple, typename ValidateEvent>
+    void operator()(std::string const &policy_desc,
+                    PolicyTuple &&policy_tuple,
+                    postfix_args_type &&postfix_args,
+                    ValidateEvent &&validate)
+    {
+      try
+      {
+        test_configuration(policy_tuple,
+                           std::get<PostfixArgSetIdx>(std::move(postfix_args)),
+                           validate);
+      }
+      catch (unittest::UnitTestException &exc)
+      {
+        // Append some identifying information to the exception:
+        std::string const overload_desc = unittest::demangle(
+          typeid(typename std::tuple_element<PostfixArgSetIdx,
+                                             postfix_args_type>::type)
+            .name());
+
+        std::string const input_desc =
+          unittest::demangle(typeid(input_type).name());
+        std::string const output_desc =
+          unittest::demangle(typeid(output_type).name());
+
+        exc << "\n"
+            << " - testcase = " << algo_def::description() << "\n"
+            << " - policy = " << policy_desc << "\n"
+            << " - input_type = " << input_desc << "\n"
+            << " - output_type = " << output_desc<< "\n"
+            << " - tuple of trailing arguments = " << overload_desc;
+        throw;
+      }
 
-#endif // THRUST_CPP_DIALECT >= 2011
+      // Recurse
+      iterate_postfix_args<PostfixArgSetIdx + 1, NumPostfixArgSets>{}(
+        policy_desc,
+        THRUST_FWD(policy_tuple),
+        std::move(postfix_args),
+        THRUST_FWD(validate));
+    }
+  };
+
+  // Terminal specialization
+  template <size_t N>
+  struct iterate_postfix_args<N, N>
+  {
+    template <typename... Ts>
+    void operator()(Ts &&...)
+    {}
+  };
+
+  // Actually invoke the algorithms with the supplied prefix/postfix args
+  // and do the validations:
+  template <typename PrefixArgTuple,
+            typename PostfixArgTuple,
+            typename ValidateEvent>
+  static void test_configuration(PrefixArgTuple &&prefix_tuple_ref,
+                                 PostfixArgTuple &&postfix_tuple_ref,
+                                 ValidateEvent const &validate)
+  {
+    using prefix_tuple_type  = thrust::remove_cvref_t<PrefixArgTuple>;
+    using postfix_tuple_type = thrust::remove_cvref_t<PostfixArgTuple>;
+
+    // Sink these tuples into const locals so they can be safely passed to
+    // multiple invocations without worrying about potential modifications.
+    prefix_tuple_type const prefix_tuple   = THRUST_FWD(prefix_tuple_ref);
+    postfix_tuple_type const postfix_tuple = THRUST_FWD(postfix_tuple_ref);
+
+    // Generate index sequences for the tuples:
+    constexpr auto prefix_tuple_size  = std::tuple_size<prefix_tuple_type>{};
+    constexpr auto postfix_tuple_size = std::tuple_size<postfix_tuple_type>{};
+    using prefix_index_seq  = std::make_index_sequence<prefix_tuple_size>;
+    using postfix_index_seq = std::make_index_sequence<postfix_tuple_size>;
+
+    input_type const input = algo_def::generate_input();
+
+    output_type test_output_a = algo_def::generate_output();
+    output_type test_output_b = algo_def::generate_output();
+    output_type test_output_c = algo_def::generate_output();
+    output_type test_output_d = algo_def::generate_output();
+
+    // Invoke multiple overlapping async algorithms, capturing their outputs
+    // and events/futures:
+    auto e_a = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input,
+                                      test_output_a,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    auto e_b = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input,
+                                      test_output_b,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    auto e_c = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input,
+                                      test_output_c,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    auto e_d = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input,
+                                      test_output_d,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+
+    // Let reference calc overlap with async testing:
+    output_type ref_output = algo_def::generate_output();
+    algo_def::invoke_reference(input,
+                               ref_output,
+                               postfix_tuple,
+                               postfix_index_seq{});
+
+    algo_def::compare_outputs(e_a, ref_output, test_output_a);
+    algo_def::compare_outputs(e_b, ref_output, test_output_b);
+    algo_def::compare_outputs(e_c, ref_output, test_output_c);
+    algo_def::compare_outputs(e_d, ref_output, test_output_d);
+
+    validate(e_a);
+    validate(e_b);
+    validate(e_c);
+    validate(e_d);
+  }
+};
+
+} // namespace unittest
 
+#endif // THRUST_CPP_DIALECT >= 2014
diff --git a/thrust/async/scan.h b/thrust/async/scan.h
new file mode 100644
index 0000000000..d7adfac983
--- /dev/null
+++ b/thrust/async/scan.h
@@ -0,0 +1,355 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file async/scan.h
+ *  \brief Functions for asynchronously computing prefix scans.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/execution_policy.h>
+#include <thrust/detail/select_system.h>
+#include <thrust/detail/static_assert.h>
+
+#include <thrust/system/detail/adl/async/scan.h>
+
+#include <thrust/type_traits/is_execution_policy.h>
+#include <thrust/type_traits/logical_metafunctions.h>
+#include <thrust/type_traits/remove_cvref.h>
+
+#include <thrust/future.h>
+
+namespace thrust
+{
+
+namespace async
+{
+
+// Fallback implementations used when no overloads are found via ADL:
+namespace unimplemented
+{
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename BinaryOp>
+event<DerivedPolicy>
+async_inclusive_scan(thrust::execution_policy<DerivedPolicy>&,
+                     ForwardIt,
+                     Sentinel,
+                     OutputIt,
+                     BinaryOp)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value),
+    "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename InitialValueType,
+          typename BinaryOp>
+event<DerivedPolicy>
+async_exclusive_scan(thrust::execution_policy<DerivedPolicy>&,
+                     ForwardIt,
+                     Sentinel,
+                     OutputIt,
+                     InitialValueType,
+                     BinaryOp)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value),
+    "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+} // namespace unimplemented
+
+namespace inclusive_scan_detail
+{
+
+// Include fallback implementation for ADL failures
+using thrust::async::unimplemented::async_inclusive_scan;
+
+// Implementation of the thrust::async::inclusive_scan CPO.
+struct inclusive_scan_fn final
+{
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename BinaryOp>
+  THRUST_NODISCARD
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             BinaryOp&& op) const
+    // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt>
+  THRUST_NODISCARD
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename BinaryOp,
+            typename = std::enable_if_t<!is_execution_policy_v<remove_cvref_t<ForwardIt>>>>
+  THRUST_NODISCARD
+  auto operator()(ForwardIt&& first,
+                  Sentinel&& last,
+                  OutputIt&& out,
+                  BinaryOp&& op) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename OutputIt>
+  THRUST_NODISCARD
+  auto operator()(ForwardIt&& first, Sentinel&& last, OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      thrust::plus<>{}
+    )
+  )
+};
+
+} // namespace inclusive_scan_detail
+
+THRUST_INLINE_CONSTANT inclusive_scan_detail::inclusive_scan_fn inclusive_scan{};
+
+namespace exclusive_scan_detail
+{
+
+// Include fallback implementation for ADL failures
+using thrust::async::unimplemented::async_exclusive_scan;
+
+// Implementation of the thrust::async::exclusive_scan CPO.
+struct exclusive_scan_fn final
+{
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType,
+            typename BinaryOp>
+  THRUST_NODISCARD
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init,
+             BinaryOp&& op) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType>
+  THRUST_NODISCARD
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt>
+  THRUST_NODISCARD
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      iterator_value_t<remove_cvref_t<ForwardIt>>{},
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType,
+            typename BinaryOp,
+            typename = std::enable_if_t<!is_execution_policy_v<remove_cvref_t<ForwardIt>>>>
+  THRUST_NODISCARD
+  auto
+  operator()(ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init,
+             BinaryOp&& op) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType,
+            typename = std::enable_if_t<!is_execution_policy_v<remove_cvref_t<ForwardIt>>>>
+  THRUST_NODISCARD
+  auto
+  operator()(ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename OutputIt>
+  THRUST_NODISCARD
+  auto operator()(ForwardIt&& first,
+                  Sentinel&& last,
+                  OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      iterator_value_t<remove_cvref_t<ForwardIt>>{},
+      thrust::plus<>{}
+    )
+  )
+};
+
+} // namespace exclusive_scan_detail
+
+THRUST_INLINE_CONSTANT exclusive_scan_detail::exclusive_scan_fn exclusive_scan{};
+
+} // namespace async
+
+} // end namespace thrust
+
+#endif
diff --git a/thrust/system/cuda/detail/async/exclusive_scan.h b/thrust/system/cuda/detail/async/exclusive_scan.h
new file mode 100644
index 0000000000..022b7177f4
--- /dev/null
+++ b/thrust/system/cuda/detail/async/exclusive_scan.h
@@ -0,0 +1,180 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/iterator/iterator_traits.h>
+
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/future.h>
+
+#include <thrust/type_traits/remove_cvref.h>
+
+#include <thrust/distance.h>
+
+#include <type_traits>
+
+// TODO specialize for thrust::plus to use e.g. ExclusiveSum instead of ExcScan
+//  - Note that thrust::plus<> is transparent, cub::Sum is not. This should be
+//    fixed in CUB first).
+//  - Need to check if CUB actually optimizes for sums before putting in effort
+
+namespace thrust
+{
+namespace system
+{
+namespace cuda
+{
+namespace detail
+{
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Size,
+          typename OutputIt,
+          typename InitialValueType,
+          typename BinaryOp>
+auto async_exclusive_scan_n(execution_policy<DerivedPolicy>& policy,
+                            ForwardIt first,
+                            Size n,
+                            OutputIt out,
+                            InitialValueType init,
+                            BinaryOp op)
+  -> unique_eager_event
+{
+  auto const device_alloc = get_async_device_allocator(policy);
+  unique_eager_event ev;
+
+  // Determine temporary device storage requirements.
+  size_t tmp_size = 0;
+  thrust::cuda_cub::throw_on_error(
+    cub::DeviceScan::ExclusiveScan(
+      nullptr,
+      tmp_size,
+      first,
+      out,
+      op,
+      init,
+      n,
+      nullptr, // Null stream, just for sizing.
+      THRUST_DEBUG_SYNC_FLAG
+    ),
+    "after exclusive_scan sizing"
+  );
+
+  // Allocate temporary storage.
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, tmp_size
+  );
+  void* const tmp_ptr = raw_pointer_cast(content.get());
+
+  // Set up stream with dependencies.
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content),
+          unique_stream(nonowning, user_raw_stream)
+        ),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+  else
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(std::move(content)),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+
+  // Run scan.
+  thrust::cuda_cub::throw_on_error(
+    cub::DeviceScan::ExclusiveScan(
+      tmp_ptr,
+      tmp_size,
+      first,
+      out,
+      op,
+      init,
+      n,
+      ev.stream().native_handle(),
+      THRUST_DEBUG_SYNC_FLAG
+    ),
+    "after exclusive_scan launch"
+  );
+
+  return std::move(ev);
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename InitialValueType,
+          typename BinaryOp>
+auto async_exclusive_scan(execution_policy<DerivedPolicy>& policy,
+                          ForwardIt first,
+                          Sentinel&& last,
+                          OutputIt&& out,
+                          InitialValueType &&init,
+                          BinaryOp&& op)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_exclusive_scan_n(
+    policy,
+    first,
+    distance(first, THRUST_FWD(last)),
+    THRUST_FWD(out),
+    THRUST_FWD(init),
+    THRUST_FWD(op)
+  )
+)
+
+} // cuda_cub
+
+} // end namespace thrust
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif
+
diff --git a/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/system/cuda/detail/async/inclusive_scan.h
new file mode 100644
index 0000000000..5f37c19cff
--- /dev/null
+++ b/thrust/system/cuda/detail/async/inclusive_scan.h
@@ -0,0 +1,173 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/iterator/iterator_traits.h>
+
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/future.h>
+
+#include <thrust/type_traits/remove_cvref.h>
+
+#include <thrust/distance.h>
+
+#include <type_traits>
+
+// TODO specialize for thrust::plus to use e.g. InclusiveSum instead of IncScan
+//  - Note that thrust::plus<> is transparent, cub::Sum is not. This should be
+//    fixed in CUB first).
+//  - Need to check if CUB actually optimizes for sums before putting in effort
+
+namespace thrust
+{
+namespace system
+{
+namespace cuda
+{
+namespace detail
+{
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Size,
+          typename OutputIt,
+          typename BinaryOp>
+auto async_inclusive_scan_n(execution_policy<DerivedPolicy>& policy,
+                            ForwardIt first,
+                            Size n,
+                            OutputIt out,
+                            BinaryOp op)
+  -> unique_eager_event
+{
+  auto const device_alloc = get_async_device_allocator(policy);
+  unique_eager_event ev;
+
+  // Determine temporary device storage requirements.
+  size_t tmp_size = 0;
+  thrust::cuda_cub::throw_on_error(
+    cub::DeviceScan::InclusiveScan(
+      nullptr,
+      tmp_size,
+      first,
+      out,
+      op,
+      n,
+      nullptr, // Null stream, just for sizing.
+      THRUST_DEBUG_SYNC_FLAG
+    ),
+    "after inclusive_scan sizing"
+  );
+
+  // Allocate temporary storage.
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, tmp_size
+  );
+  void* const tmp_ptr = raw_pointer_cast(content.get());
+
+  // Set up stream with dependencies.
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content),
+          unique_stream(nonowning, user_raw_stream)
+        ),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+  else
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(std::move(content)),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+
+  // Run scan.
+  thrust::cuda_cub::throw_on_error(
+    cub::DeviceScan::InclusiveScan(
+      tmp_ptr,
+      tmp_size,
+      first,
+      out,
+      op,
+      n,
+      ev.stream().native_handle(),
+      THRUST_DEBUG_SYNC_FLAG
+    ),
+    "after inclusive_scan launch"
+  );
+
+  return std::move(ev);
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename BinaryOp>
+auto async_inclusive_scan(execution_policy<DerivedPolicy>& policy,
+                          ForwardIt first,
+                          Sentinel&& last,
+                          OutputIt&& out,
+                          BinaryOp&& op)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_inclusive_scan_n(
+    policy,
+    first,
+    distance(first, THRUST_FWD(last)),
+    THRUST_FWD(out),
+    THRUST_FWD(op)
+  )
+)
+
+} // cuda_cub
+
+} // end namespace thrust
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif
+
diff --git a/thrust/system/cuda/detail/async/scan.h b/thrust/system/cuda/detail/async/scan.h
new file mode 100644
index 0000000000..7d993e6641
--- /dev/null
+++ b/thrust/system/cuda/detail/async/scan.h
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/cpp14_required.h>
+
+#include <thrust/system/cuda/detail/async/exclusive_scan.h>
+#include <thrust/system/cuda/detail/async/inclusive_scan.h>
diff --git a/thrust/system/detail/adl/async/scan.h b/thrust/system/detail/adl/async/scan.h
new file mode 100644
index 0000000000..a2a90618b4
--- /dev/null
+++ b/thrust/system/detail/adl/async/scan.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/scan.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async scans.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/scan.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/scan.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/scan.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER
+
diff --git a/thrust/system/detail/generic/scan.inl b/thrust/system/detail/generic/scan.inl
index 300b697b26..83d272c3e9 100644
--- a/thrust/system/detail/generic/scan.inl
+++ b/thrust/system/detail/generic/scan.inl
@@ -61,9 +61,7 @@ __host__ __device__
 {
   // Use the input iterator's value type per https://wg21.link/P0571
   using ValueType = typename thrust::iterator_value<InputIterator>::type;
-
-  // assume 0 as the initialization value
-  return thrust::exclusive_scan(exec, first, last, result, ValueType(0));
+  return thrust::exclusive_scan(exec, first, last, result, ValueType{});
 } // end exclusive_scan()