From 0520e8f538429e83d04633dd3af81b628f1d8648 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 26 Feb 2025 13:24:29 +0100 Subject: [PATCH 1/4] Remove outdated debug code that is no longer needed because we have better instrumentation available --- .../experimental/__stf/graph/graph_ctx.cuh | 52 ------------------- .../__stf/internal/executable_graph_cache.cuh | 16 ------ 2 files changed, 68 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh index 58ca20b0873..77c43ab0770 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh @@ -39,29 +39,6 @@ namespace cuda::experimental::stf { -namespace reserved -{ - -// For counters -class graph_tag -{ -public: - class launch - {}; - class instantiate - {}; - class update - { - public: - class success - {}; - class failure - {}; - }; -}; - -} // end namespace reserved - /** * @brief Uncached allocator (used as a base for other allocators) * @@ -348,23 +325,6 @@ public: state.submitted_stream = nullptr; state.cleanup(); set_phase(backend_ctx_untyped::phase::finalized); - -#ifdef CUDASTF_DEBUG - const char* display_stats_env = getenv("CUDASTF_DISPLAY_STATS"); - if (!display_stats_env || atoi(display_stats_env) == 0) - { - return; - } - - fprintf( - stderr, "[STATS CUDA GRAPHS] instantiated=%lu\n", reserved::counter.load()); - fprintf(stderr, "[STATS CUDA GRAPHS] launched=%lu\n", reserved::counter.load()); - fprintf(stderr, - "[STATS CUDA GRAPHS] updated=%lu success=%ld failed=%ld\n", - reserved::counter.load(), - reserved::counter.load(), - reserved::counter.load()); -#endif } void submit(cudaStream_t stream = nullptr) @@ -394,10 +354,6 @@ public: cuda_try(cudaGraphLaunch(*state.exec_graph, state.submitted_stream)); -#ifdef CUDASTF_DEBUG - reserved::counter.increment(); -#endif - // Note that we comment this out for now, so that it is possible to use // the print_to_dot method; but we may perhaps discard this graph to // some dedicated member variable. @@ -614,10 +570,6 @@ private: cuda_try(cudaGraphInstantiateWithFlags(res.get(), g, 0)); -#ifdef CUDASTF_DEBUG - reserved::counter.increment(); -#endif - return res; } @@ -712,10 +664,6 @@ private: cuda_try(cudaGraphLaunch(local_exec_graph, state.submitted_stream)); -#ifdef CUDASTF_DEBUG - reserved::counter.increment(); -#endif - return state.submitted_stream; } diff --git a/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh b/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh index 9b60beb6345..40d2b1bf30d 100644 --- a/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh @@ -54,18 +54,6 @@ inline bool try_updating_executable_graph(cudaGraphExec_t exec_graph, cudaGraph_ // Be sure to "erase" the last error cudaError_t res = cudaGetLastError(); -#ifdef CUDASTF_DEBUG - reserved::counter.increment(); - if (res == cudaSuccess) - { - reserved::counter.increment(); - } - else - { - reserved::counter.increment(); - } -#endif - return (res == cudaSuccess); } @@ -81,10 +69,6 @@ inline ::std::shared_ptr graph_instantiate(cudaGraph_t g) cuda_try(cudaGraphInstantiateWithFlags(res.get(), g, 0)); -#ifdef CUDASTF_DEBUG - reserved::counter.increment(); -#endif - return res; } From b0216bad0b663959867ee245c110fe7253c653f7 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 26 Feb 2025 14:48:28 +0100 Subject: [PATCH 2/4] Misc. compilation fixes for CUDASTF_DEBUG --- .../__stf/stream/internal/event_types.cuh | 10 +++++----- .../cuda/experimental/__stf/stream/stream_ctx.cuh | 15 ++++++++------- .../experimental/__stf/stream/stream_task.cuh | 10 ++++++---- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh b/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh index e68dca7d0bf..9017131425c 100644 --- a/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh @@ -133,7 +133,7 @@ public: reserved::counter::increment(); reserved::counter::increment(); reserved::high_water_mark::record( - reserved::counter::load()); + reserved::counter::load()); #endif cuda_safe_call(cudaEventRecord(sync_event, s2)); @@ -141,7 +141,7 @@ public: // According to documentation "event may be from a different device than stream." cuda_safe_call(cudaStreamWaitEvent(s1, sync_event, 0)); #ifdef CUDASTF_DEBUG - reserved::counter.increment(); + reserved::counter::increment(); #endif // Asynchronously destroy event to avoid a memleak @@ -171,7 +171,7 @@ public: reserved::counter::increment(); reserved::counter::increment(); reserved::high_water_mark::record( - reserved::counter::load()); + reserved::counter::load()); #endif cuda_safe_call(cudaEventRecord(cudaEvent, dstream.stream)); }; @@ -187,7 +187,7 @@ public: { cuda_safe_call(cudaStreamWaitEvent(dstream.stream, from.cudaEvent, 0)); #ifdef CUDASTF_DEBUG - reserved::counter.increment(); + reserved::counter::increment(); #endif } } @@ -416,7 +416,7 @@ private: { cuda_safe_call(cudaStreamWaitEvent(dstream.stream, se->get_cuda_event(), 0)); #ifdef CUDASTF_DEBUG - reserved::counter.increment(); + reserved::counter::increment(); #endif } } diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh index bb846ed65cb..3165d3a97a4 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh @@ -36,6 +36,7 @@ #include // For implicit logical_data_untyped constructors #include #include +#include // for reserved::counter namespace cuda::experimental::stf { @@ -520,14 +521,14 @@ public: #ifdef CUDASTF_DEBUG // Ensure that the total number of CUDA events created corresponds to // the number of events destroyed - const auto alive = reserved::counter.load(); + const auto alive = reserved::counter::load(); if (alive != 0) { fprintf(stderr, "WARNING!!! %lu CUDA events leaked (approx %lu created vs. %lu destroyed).\n", alive, - reserved::counter.load(), - reserved::counter.load()); + reserved::counter::load(), + reserved::counter::load()); } assert(alive == 0); @@ -540,13 +541,13 @@ public: fprintf(stderr, "[STATS CUDA EVENTS] created=%lu destroyed=%lu alive=%lu reserved::high_water_mark=%lu\n", - reserved::counter.load(), - reserved::counter.load(), + reserved::counter::load(), + reserved::counter::load(), alive, - reserved::high_water_mark.load()); + reserved::high_water_mark::load()); fprintf(stderr, "[STATS CUDA EVENTS] cuda_stream_wait_event=%lu\n", - reserved::counter.load()); + reserved::counter::load()); #endif } diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh index 348136778b7..cb2fa6b48b7 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh @@ -416,8 +416,10 @@ private: // Disable timing to avoid implicit barriers cuda_safe_call(cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming)); #ifdef CUDASTF_DEBUG - reserved::counter ++; - reserved::high_water_mark.record(++reserved::counter); + reserved::counter::increment(); + reserved::counter::increment(); + reserved::high_water_mark::record( + reserved::counter::load()); #endif cuda_safe_call(cudaEventRecord(sync_event, streams[0].stream)); @@ -431,8 +433,8 @@ private: // Asynchronously destroy event to avoid a memleak cuda_safe_call(cudaEventDestroy(sync_event)); #ifdef CUDASTF_DEBUG - reserved::counter.increment(); - reserved::counter.decrement(); + reserved::counter::increment(); + reserved::counter::decrement(); #endif if (current_dev != s0_dev) From 10dbae2efa7bbfa686b1c40066447511be183a88 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 26 Feb 2025 15:22:44 +0100 Subject: [PATCH 3/4] Entirely remove CUDASTF_DEBUG --- CMakePresets.json | 4 +-- cudax/CMakeLists.txt | 5 ++- cudax/cmake/cudaxSTFConfigureTarget.cmake | 6 ---- .../__stf/internal/backend_ctx.cuh | 5 +-- .../__stf/stream/internal/event_types.cuh | 30 ----------------- .../experimental/__stf/stream/stream_ctx.cuh | 32 ------------------- .../experimental/__stf/stream/stream_task.cuh | 10 ------ 7 files changed, 4 insertions(+), 88 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 5940422fa47..a78a7c57edd 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -84,7 +84,6 @@ "cudax_ENABLE_CUDASTF": true, "cudax_ENABLE_CUDASTF_BOUNDSCHECK": false, "cudax_ENABLE_CUDASTF_CODE_GENERATION": true, - "cudax_ENABLE_CUDASTF_DEBUG": false, "cudax_ENABLE_CUDASTF_MATHLIBS": false, "cudax_ENABLE_DIALECT_CPP17": true, "cudax_ENABLE_DIALECT_CPP20": true, @@ -100,8 +99,7 @@ "CMAKE_BUILD_TYPE": "Debug", "CMAKE_CUDA_FLAGS": "-G", "CCCL_ENABLE_BENCHMARKS": false, - "cudax_ENABLE_CUDASTF_BOUNDSCHECK": true, - "cudax_ENABLE_CUDASTF_DEBUG": true + "cudax_ENABLE_CUDASTF_BOUNDSCHECK": true } }, { diff --git a/cudax/CMakeLists.txt b/cudax/CMakeLists.txt index a9725331592..86d2475988d 100644 --- a/cudax/CMakeLists.txt +++ b/cudax/CMakeLists.txt @@ -23,12 +23,11 @@ option(cudax_ENABLE_EXAMPLES "Build CUDA Experimental's examples." ON) option(cudax_ENABLE_CUDASTF "Enable CUDASTF subproject" ON) option(cudax_ENABLE_CUDASTF_CODE_GENERATION "Enable code generation using STF's parallel_for or launch with CUDA compiler." ON) option(cudax_ENABLE_CUDASTF_BOUNDSCHECK "Enable bounds checks for STF targets. Requires debug build." OFF) -option(cudax_ENABLE_CUDASTF_DEBUG "Enable additional debugging for STF targets. Requires debug build." OFF) option(cudax_ENABLE_CUDASTF_MATHLIBS "Enable STF tests/examples that use cublas/cusolver." OFF) -if ((cudax_ENABLE_CUDASTF_BOUNDSCHECK OR cudax_ENABLE_CUDASTF_DEBUG) AND +if (cudax_ENABLE_CUDASTF_BOUNDSCHECK AND NOT CMAKE_BUILD_TYPE MATCHES "Debug" AND NOT CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo") - message(FATAL_ERROR "cudax_ENABLE_CUDASTF_BOUNDSCHECK and cudax_ENABLE_CUDASTF_DEBUG require a Debug build.") + message(FATAL_ERROR "cudax_ENABLE_CUDASTF_BOUNDSCHECK requires a Debug build.") endif() include(cmake/cudaxBuildCompilerTargets.cmake) diff --git a/cudax/cmake/cudaxSTFConfigureTarget.cmake b/cudax/cmake/cudaxSTFConfigureTarget.cmake index 26ddfd0060e..81c52e29722 100644 --- a/cudax/cmake/cudaxSTFConfigureTarget.cmake +++ b/cudax/cmake/cudaxSTFConfigureTarget.cmake @@ -41,10 +41,4 @@ function(cudax_stf_configure_target target_name) "CUDASTF_BOUNDSCHECK" ) endif() - - if (cudax_ENABLE_CUDASTF_DEBUG) - target_compile_definitions(${target_name} PRIVATE - "CUDASTF_DEBUG" - ) - endif() endfunction() diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index 56f4a719a4b..0d44a2132d7 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -554,11 +554,8 @@ protected: reserved::dot::instance().is_timing()); // We generate symbols if we may use them -#ifdef CUDASTF_DEBUG - generate_event_symbols = true; -#else generate_event_symbols = dot->is_tracing_prereqs(); -#endif + // Record it in the list of all traced contexts reserved::dot::instance().per_ctx.push_back(dot); } diff --git a/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh b/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh index 9017131425c..5528b167ce6 100644 --- a/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh @@ -77,10 +77,6 @@ protected: if (cudaEvent) { cuda_safe_call(cudaEventDestroy(cudaEvent)); -#ifdef CUDASTF_DEBUG - reserved::counter::increment(); - reserved::counter::decrement(); -#endif // fprintf(stderr, "DESTROY EVENT %p #%d (created %d)\n", event, ++destroyed_event_cnt, // event_cnt); @@ -129,27 +125,13 @@ public: // Disable timing to avoid implicit barriers cudaEvent_t sync_event; cuda_safe_call(cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming)); -#ifdef CUDASTF_DEBUG - reserved::counter::increment(); - reserved::counter::increment(); - reserved::high_water_mark::record( - reserved::counter::load()); -#endif - cuda_safe_call(cudaEventRecord(sync_event, s2)); // According to documentation "event may be from a different device than stream." cuda_safe_call(cudaStreamWaitEvent(s1, sync_event, 0)); -#ifdef CUDASTF_DEBUG - reserved::counter::increment(); -#endif // Asynchronously destroy event to avoid a memleak cuda_safe_call(cudaEventDestroy(sync_event)); -#ifdef CUDASTF_DEBUG - reserved::counter::increment(); - reserved::counter::decrement(); -#endif }; } @@ -167,12 +149,6 @@ public: cuda_safe_call(cudaEventCreateWithFlags(&cudaEvent, cudaEventDisableTiming)); // fprintf(stderr, "CREATE EVENT %p %s\n", cudaEvent, get_symbol().c_str()); assert(cudaEvent); -#ifdef CUDASTF_DEBUG - reserved::counter::increment(); - reserved::counter::increment(); - reserved::high_water_mark::record( - reserved::counter::load()); -#endif cuda_safe_call(cudaEventRecord(cudaEvent, dstream.stream)); }; } @@ -186,9 +162,6 @@ public: if (!skip) { cuda_safe_call(cudaStreamWaitEvent(dstream.stream, from.cudaEvent, 0)); -#ifdef CUDASTF_DEBUG - reserved::counter::increment(); -#endif } } } @@ -415,9 +388,6 @@ private: if (!skip) { cuda_safe_call(cudaStreamWaitEvent(dstream.stream, se->get_cuda_event(), 0)); -#ifdef CUDASTF_DEBUG - reserved::counter::increment(); -#endif } } se->outbound_deps++; diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh index 3165d3a97a4..c515233811b 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh @@ -517,38 +517,6 @@ public: } state.cleanup(); set_phase(backend_ctx_untyped::phase::finalized); - -#ifdef CUDASTF_DEBUG - // Ensure that the total number of CUDA events created corresponds to - // the number of events destroyed - const auto alive = reserved::counter::load(); - if (alive != 0) - { - fprintf(stderr, - "WARNING!!! %lu CUDA events leaked (approx %lu created vs. %lu destroyed).\n", - alive, - reserved::counter::load(), - reserved::counter::load()); - } - - assert(alive == 0); - - const char* display_stats_env = getenv("CUDASTF_DISPLAY_STATS"); - if (!display_stats_env || atoi(display_stats_env) == 0) - { - return; - } - - fprintf(stderr, - "[STATS CUDA EVENTS] created=%lu destroyed=%lu alive=%lu reserved::high_water_mark=%lu\n", - reserved::counter::load(), - reserved::counter::load(), - alive, - reserved::high_water_mark::load()); - fprintf(stderr, - "[STATS CUDA EVENTS] cuda_stream_wait_event=%lu\n", - reserved::counter::load()); -#endif } float get_submission_time_ms() const diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh index cb2fa6b48b7..d1f51751dd0 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh @@ -415,12 +415,6 @@ private: cudaEvent_t sync_event; // Disable timing to avoid implicit barriers cuda_safe_call(cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming)); -#ifdef CUDASTF_DEBUG - reserved::counter::increment(); - reserved::counter::increment(); - reserved::high_water_mark::record( - reserved::counter::load()); -#endif cuda_safe_call(cudaEventRecord(sync_event, streams[0].stream)); @@ -432,10 +426,6 @@ private: // Asynchronously destroy event to avoid a memleak cuda_safe_call(cudaEventDestroy(sync_event)); -#ifdef CUDASTF_DEBUG - reserved::counter::increment(); - reserved::counter::decrement(); -#endif if (current_dev != s0_dev) { From 823b2c0d782ef75a3c36aa98115c1d570f1b9443 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 26 Feb 2025 15:30:55 +0100 Subject: [PATCH 4/4] remove more outdated debug code --- .../experimental/__stf/internal/async_prereq.cuh | 10 ---------- .../experimental/__stf/internal/backend_ctx.cuh | 2 -- .../__stf/stream/internal/event_types.cuh | 15 --------------- 3 files changed, 27 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh b/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh index f6e79500d11..aba0c3e3735 100644 --- a/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh @@ -433,15 +433,6 @@ inline event_list event_impl::from_stream(backend_ctx_untyped&, cudaStream_t) co } _CCCL_DIAG_POP -namespace reserved -{ - -// For counters -class join_tag -{}; - -} // end namespace reserved - /** * @brief Introduce a dependency from all entries of an event list to an event. @@ -474,7 +465,6 @@ void join(context_t& ctx, some_event& to, event_list& prereq_in) { from = static_cast(item.operator->()); } - reserved::counter::increment(); to.insert_dep(ctx.async_resources(), *from); from->outbound_deps++; } diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index 0d44a2132d7..06fed8c7ff7 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -572,8 +572,6 @@ protected: } display_transfers(); - - fprintf(stderr, "TOTAL SYNC COUNT: %lu\n", reserved::counter::load()); } impl(const impl&) = delete; diff --git a/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh b/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh index 5528b167ce6..b2f7320192f 100644 --- a/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh @@ -42,21 +42,6 @@ inline event join_with_stream( using stream_and_event_vector = small_vector, 7>; -/* Tag types for event counters */ -class cuda_event_tag -{ -public: - class created - {}; - class alive - {}; - class destroyed - {}; -}; - -class cuda_stream_wait_event_tag -{}; - } // namespace reserved /* This event type allows to synchronize a CUDA stream with a CUDA event in