Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into enh/merge-large-num…
Browse files Browse the repository at this point in the history
…-items
  • Loading branch information
elstehle committed Jan 29, 2025
2 parents cdd40d1 + 09b1200 commit 57d2b17
Show file tree
Hide file tree
Showing 64 changed files with 715 additions and 315 deletions.
14 changes: 7 additions & 7 deletions ci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -257,13 +257,13 @@ projects:

# testing -> Runner with GPU is in a nv-gh-runners testing pool
gpus:
v100: { sm: 70 } # 32 GB, 40 runners
t4: { sm: 75, testing: true } # 16 GB, 8 runners
rtx2080: { sm: 75, testing: true } # 8 GB, 8 runners
rtxa6000: { sm: 86, testing: true } # 48 GB, 12 runners
l4: { sm: 89, testing: true } # 24 GB, 48 runners
rtx4090: { sm: 89, testing: true } # 24 GB, 10 runners
h100: { sm: 90, testing: true } # 80 GB, 16 runners
v100: { sm: 70 } # 32 GB, 40 runners
t4: { sm: 75 } # 16 GB, 10 runners
rtx2080: { sm: 75 } # 8 GB, 12 runners
rtxa6000: { sm: 86 } # 48 GB, 12 runners
l4: { sm: 89 } # 24 GB, 48 runners
rtx4090: { sm: 89 } # 24 GB, 10 runners
h100: { sm: 90 } # 80 GB, 16 runners

# Tags are used to define a `matrix job` in the workflow section.
#
Expand Down
4 changes: 3 additions & 1 deletion cub/cub/util_device.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -713,4 +713,6 @@ private:

CUB_NAMESPACE_END

#include <cub/detail/launcher/cuda_runtime.cuh> // to complete the definition of TripleChevronFactory
#if _CCCL_HAS_CUDA_COMPILER
# include <cub/detail/launcher/cuda_runtime.cuh> // to complete the definition of TripleChevronFactory
#endif // _CCCL_HAS_CUDA_COMPILER
8 changes: 4 additions & 4 deletions cudax/examples/stf/linear_algebra/07-cholesky.cu
Original file line number Diff line number Diff line change
Expand Up @@ -659,14 +659,14 @@ int main(int argc, char** argv)
return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row);
};

ctx.dot_push_section("fillA");
auto s = ctx.dot_section("fillA");
if (check_result)
{
Aref.fill(hilbert);
}

A.fill(hilbert);
ctx.dot_pop_section();
s.end();

/* Right-hand side */
matrix<double> B_potrs(N, 1, NB, 1, false, "B");
Expand All @@ -693,9 +693,9 @@ int main(int argc, char** argv)
cudaEvent_t startEvent_pdpotrf, stopEvent_pdpotrf;
float milliseconds_pdpotrf = 0;

// for (int row = 0; row < A.mt; row++)
// for (size_t row = 0; row < A.mt; row++)
// {
// for (int col = 0; col <= row; col++)
// for (size_t col = 0; col <= row; col++)
// {
// cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col)));
// NOOP(A, row, col);
Expand Down
8 changes: 4 additions & 4 deletions cudax/examples/stf/linear_algebra/07-potri.cu
Original file line number Diff line number Diff line change
Expand Up @@ -197,17 +197,17 @@ public:
void print()
{
// print blocks by blocks
for (int colb = 0; colb < nt; colb++)
for (size_t colb = 0; colb < nt; colb++)
{
int low_rowb = sym_matrix ? colb : 0;
for (int rowb = low_rowb; rowb < mt; rowb++)
for (size_t rowb = low_rowb; rowb < mt; rowb++)
{
// Each task fills a block
ctx.host_launch(get_handle(rowb, colb).read())->*[=](auto sA) {
for (int lcol = 0; lcol < sA.extent(1); lcol++)
for (size_t lcol = 0; lcol < sA.extent(1); lcol++)
{
size_t col = lcol + colb * sA.extent(1);
for (int lrow = 0; lrow < sA.extent(0); lrow++)
for (size_t lrow = 0; lrow < sA.extent(0); lrow++)
{
size_t row = lrow + rowb * sA.extent(0);

Expand Down
9 changes: 8 additions & 1 deletion cudax/examples/stf/linear_algebra/cg_csr.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public:
static void copy_vector(const vector& from, vector& to)
{
to.ctx.parallel_for(to.handle.shape(), to.handle.write(), from.handle.read()).set_symbol("copy_vector")
->*[] _CCCL_DEVICE(size_t i, slice<double> dto, slice<double> dfrom) {
->*[] _CCCL_DEVICE(size_t i, slice<double> dto, slice<const double> dfrom) {
dto(i) = dfrom(i);
};
}
Expand Down Expand Up @@ -116,6 +116,13 @@ public:
copy_scalar(a, *this);
}

// Move assignment operator: assigns `mv(a.handle)` to `handle`, then `mv(a.ctx)` to `ctx`,
// and returns a reference to this object so assignments can be chained.
// NOTE(review): `mv` is defined outside this view — presumably a std::move-style cast; confirm.
// NOTE(review): no self-assignment guard here; verify that `s = mv(s)` is harmless for these
// member types, and that the handle-before-ctx order is intentional.
scalar& operator=(scalar&& a)
{
handle = mv(a.handle);
ctx = mv(a.ctx);
return *this;
}

scalar operator/(scalar const& rhs) const
{
// Submit a task that computes this/rhs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ struct receiver_defaults
};

template <class _Data, class _Rcvr>
struct basic_receiver
struct _CCCL_TYPE_VISIBILITY_DEFAULT basic_receiver
{
using receiver_concept = __async::receiver_t;
using __rcvr_t = typename _Data::receiver_tag;
Expand Down Expand Up @@ -212,10 +212,10 @@ _CUDAX_TRIVIAL_API auto __get_attrs(long, const _Data&, const _Sndrs&... __sndrs
}

template <class _Data, class... _Sndrs>
struct basic_sender;
struct _CCCL_TYPE_VISIBILITY_DEFAULT basic_sender;

template <class _Data, class _Sndr>
struct basic_sender<_Data, _Sndr>
struct _CCCL_TYPE_VISIBILITY_DEFAULT basic_sender<_Data, _Sndr>
{
using sender_concept = __async::sender_t;
using __tag_t = typename _Data::sender_tag;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ struct __cond_t
};

template <class _Sndr, class _Pred, class _Then, class _Else>
struct __sndr_t;
struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t;

template <class _Pred, class _Then, class _Else>
struct __closure
Expand Down Expand Up @@ -174,7 +174,7 @@ struct __cond_t
};

template <class _Sndr, class _Pred, class _Then, class _Else>
struct __cond_t::__sndr_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __cond_t::__sndr_t
{
__cond_t __tag_;
__cond_t::__data<_Pred, _Then, _Else> __data_;
Expand Down
10 changes: 5 additions & 5 deletions cudax/include/cuda/experimental/__async/sender/continue_on.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ private:
completion_signatures<set_error_t(__decay_t<_Error>), set_error_t(::std::exception_ptr)>>;

template <class _Rcvr, class _Result>
struct __rcvr_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_t
{
using receiver_concept = receiver_t;
_Rcvr __rcvr_;
Expand Down Expand Up @@ -127,7 +127,7 @@ private:
};

template <class _Rcvr, class _CvSndr, class _Sch>
struct __opstate_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t
{
_CUDAX_API friend auto get_env(const __opstate_t* __self) noexcept -> env_of_t<_Rcvr>
{
Expand Down Expand Up @@ -197,7 +197,7 @@ private:
};

template <class _Sndr, class _Sch>
struct __sndr_t;
struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t;

template <class _Sch>
struct __closure_t;
Expand All @@ -211,7 +211,7 @@ public:
};

template <class _Sch>
struct continue_on_t::__closure_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT continue_on_t::__closure_t
{
_Sch __sch;

Expand All @@ -223,7 +223,7 @@ struct continue_on_t::__closure_t
};

template <class _Sndr, class _Sch>
struct continue_on_t::__sndr_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT continue_on_t::__sndr_t
{
using sender_concept = sender_t;
_CCCL_NO_UNIQUE_ADDRESS continue_on_t __tag;
Expand Down
6 changes: 3 additions & 3 deletions cudax/include/cuda/experimental/__async/sender/env.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ template <class _Ty>
using __unwrap_reference_t = decltype(__unwrap_ref<_Ty>);

template <class _Query, class _Value>
struct prop
struct _CCCL_TYPE_VISIBILITY_DEFAULT prop
{
_CCCL_NO_UNIQUE_ADDRESS _Query __query;
_CCCL_NO_UNIQUE_ADDRESS _Value __value;
Expand All @@ -77,7 +77,7 @@ struct prop
};

template <class... _Envs>
struct env
struct _CCCL_TYPE_VISIBILITY_DEFAULT env
{
__tuple<_Envs...> __envs_;

Expand Down Expand Up @@ -108,7 +108,7 @@ struct env

// partial specialization for two environments
template <class _Env0, class _Env1>
struct env<_Env0, _Env1>
struct _CCCL_TYPE_VISIBILITY_DEFAULT env<_Env0, _Env1>
{
_CCCL_NO_UNIQUE_ADDRESS _Env0 __env0_;
_CCCL_NO_UNIQUE_ADDRESS _Env1 __env1_;
Expand Down
4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__async/sender/just.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ private:
using _SetTag = decltype(__detail::__set_tag<_Disposition>());

template <class _Rcvr, class... _Ts>
struct __opstate_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t
{
using operation_state_concept = operation_state_t;
using completion_signatures = __async::completion_signatures<_SetTag(_Ts...)>;
Expand All @@ -85,7 +85,7 @@ private:
};

template <class... _Ts>
struct __sndr_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t
{
using sender_concept = sender_t;
using completion_signatures = __async::completion_signatures<_SetTag(_Ts...)>;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ private:
};

template <class _Fn>
struct __sndr_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t
{
using sender_concept = sender_t;

Expand Down
4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__async/sender/let_value.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ private:
/// @tparam _Rcvr The receiver connected to the `let_(value|error|stopped)`
/// sender.
template <class _Rcvr, class _CvSndr, class _Fn>
struct __opstate_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t
{
_CUDAX_API friend env_of_t<_Rcvr> get_env(const __opstate_t* __self) noexcept
{
Expand Down Expand Up @@ -235,7 +235,7 @@ private:
/// @tparam _Fn The function to be called when the predecessor sender
/// completes.
template <class _Sndr, class _Fn>
struct __sndr_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t
{
using sender_concept = sender_t;
_CCCL_NO_UNIQUE_ADDRESS _LetTag __tag_;
Expand Down
8 changes: 4 additions & 4 deletions cudax/include/cuda/experimental/__async/sender/read_env.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ private:
};

template <class _Rcvr, class _Query>
struct __opstate_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t
{
using operation_state_concept = operation_state_t;
using completion_signatures = //
Expand Down Expand Up @@ -107,7 +107,7 @@ private:

// This makes read_env a dependent sender:
template <class _Query>
struct __opstate_t<receiver_archetype, _Query>
struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t<receiver_archetype, _Query>
{
using operation_state_concept = operation_state_t;
using completion_signatures = dependent_completions;
Expand All @@ -116,7 +116,7 @@ private:
};

template <class _Query>
struct __sndr_t;
struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t;

public:
/// @brief Returns a sender that, when connected to a receiver and started,
Expand All @@ -127,7 +127,7 @@ public:
};

template <class _Query>
struct read_env_t::__sndr_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT read_env_t::__sndr_t
{
using sender_concept = sender_t;
_CCCL_NO_UNIQUE_ADDRESS read_env_t __tag;
Expand Down
6 changes: 3 additions & 3 deletions cudax/include/cuda/experimental/__async/sender/sequence.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ struct __seq
};

template <class _Zip>
struct __opstate
struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate
{
using operation_state_concept = operation_state_t;

Expand Down Expand Up @@ -99,14 +99,14 @@ struct __seq
};

template <class _Sndr1, class _Sndr2>
struct __sndr_t;
struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t;

template <class _Sndr1, class _Sndr2>
_CUDAX_API auto operator()(_Sndr1 __sndr1, _Sndr2 __sndr2) const -> __sndr_t<_Sndr1, _Sndr2>;
};

template <class _Sndr1, class _Sndr2>
struct __seq::__sndr_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __seq::__sndr_t
{
using sender_concept = sender_t;
using __sndr1_t = _Sndr1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ private:
struct __opstate_base_t : __immovable
{};

struct __rcvr_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_t
{
using receiver_concept = receiver_t;

Expand All @@ -66,7 +66,7 @@ private:
};

template <class _Sndr>
struct __opstate_t : __opstate_base_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t : __opstate_base_t
{
using operation_state_concept = operation_state_t;
using completion_signatures = __async::completion_signatures_of_t<_Sndr, __rcvr_t>;
Expand Down
6 changes: 3 additions & 3 deletions cudax/include/cuda/experimental/__async/sender/start_on.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ private:
#endif // !_CCCL_CUDA_COMPILER(NVCC)

template <class _Rcvr, class _Sch, class _CvSndr>
struct __opstate_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t
{
_CUDAX_API friend env_of_t<_Rcvr> get_env(const __opstate_t* __self) noexcept
{
Expand Down Expand Up @@ -103,7 +103,7 @@ private:
};

template <class _Sch, class _Sndr>
struct __sndr_t;
struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t;

public:
template <class _Sch, class _Sndr>
Expand All @@ -112,7 +112,7 @@ public:
} start_on{};

template <class _Sch, class _Sndr>
struct start_on_t::__sndr_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT start_on_t::__sndr_t
{
using sender_concept = sender_t;
_CCCL_NO_UNIQUE_ADDRESS start_on_t __tag_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ private:
template <class _Sndr>
struct __state_t
{
struct __rcvr_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_t
{
using receiver_concept = receiver_t;
__state_t* __state_;
Expand Down
4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__async/sender/then.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ private:
__type_try_quote<__concat_completion_signatures>::__call>;

template <class _Rcvr, class _CvSndr, class _Fn>
struct __opstate_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t
{
_CUDAX_API friend env_of_t<_Rcvr> get_env(const __opstate_t* __self) noexcept
{
Expand Down Expand Up @@ -213,7 +213,7 @@ private:
};

template <class _Fn, class _Sndr>
struct __sndr_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t
{
using sender_concept = sender_t;
_CCCL_NO_UNIQUE_ADDRESS _UponTag __tag_;
Expand Down
Loading

0 comments on commit 57d2b17

Please sign in to comment.