diff --git a/CMakePresets.json b/CMakePresets.json index 5940422fa47..a78a7c57edd 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -84,7 +84,6 @@ "cudax_ENABLE_CUDASTF": true, "cudax_ENABLE_CUDASTF_BOUNDSCHECK": false, "cudax_ENABLE_CUDASTF_CODE_GENERATION": true, - "cudax_ENABLE_CUDASTF_DEBUG": false, "cudax_ENABLE_CUDASTF_MATHLIBS": false, "cudax_ENABLE_DIALECT_CPP17": true, "cudax_ENABLE_DIALECT_CPP20": true, @@ -100,8 +99,7 @@ "CMAKE_BUILD_TYPE": "Debug", "CMAKE_CUDA_FLAGS": "-G", "CCCL_ENABLE_BENCHMARKS": false, - "cudax_ENABLE_CUDASTF_BOUNDSCHECK": true, - "cudax_ENABLE_CUDASTF_DEBUG": true + "cudax_ENABLE_CUDASTF_BOUNDSCHECK": true } }, { diff --git a/cudax/CMakeLists.txt b/cudax/CMakeLists.txt index a9725331592..86d2475988d 100644 --- a/cudax/CMakeLists.txt +++ b/cudax/CMakeLists.txt @@ -23,12 +23,11 @@ option(cudax_ENABLE_EXAMPLES "Build CUDA Experimental's examples." ON) option(cudax_ENABLE_CUDASTF "Enable CUDASTF subproject" ON) option(cudax_ENABLE_CUDASTF_CODE_GENERATION "Enable code generation using STF's parallel_for or launch with CUDA compiler." ON) option(cudax_ENABLE_CUDASTF_BOUNDSCHECK "Enable bounds checks for STF targets. Requires debug build." OFF) -option(cudax_ENABLE_CUDASTF_DEBUG "Enable additional debugging for STF targets. Requires debug build." OFF) option(cudax_ENABLE_CUDASTF_MATHLIBS "Enable STF tests/examples that use cublas/cusolver." OFF) -if ((cudax_ENABLE_CUDASTF_BOUNDSCHECK OR cudax_ENABLE_CUDASTF_DEBUG) AND +if (cudax_ENABLE_CUDASTF_BOUNDSCHECK AND NOT CMAKE_BUILD_TYPE MATCHES "Debug" AND NOT CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo") - message(FATAL_ERROR "cudax_ENABLE_CUDASTF_BOUNDSCHECK and cudax_ENABLE_CUDASTF_DEBUG require a Debug build.") + message(FATAL_ERROR "cudax_ENABLE_CUDASTF_BOUNDSCHECK requires a Debug build.") endif() include(cmake/cudaxBuildCompilerTargets.cmake) diff --git a/cudax/cmake/cudaxSTFConfigureTarget.cmake b/cudax/cmake/cudaxSTFConfigureTarget.cmake index 26ddfd0060e..81c52e29722 100644 --- a/cudax/cmake/cudaxSTFConfigureTarget.cmake +++ b/cudax/cmake/cudaxSTFConfigureTarget.cmake @@ -41,10 +41,4 @@ function(cudax_stf_configure_target target_name) "CUDASTF_BOUNDSCHECK" ) endif() - - if (cudax_ENABLE_CUDASTF_DEBUG) - target_compile_definitions(${target_name} PRIVATE - "CUDASTF_DEBUG" - ) - endif() endfunction() diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh index 58ca20b0873..77c43ab0770 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh @@ -39,29 +39,6 @@ namespace cuda::experimental::stf { -namespace reserved -{ - -// For counters -class graph_tag -{ -public: - class launch - {}; - class instantiate - {}; - class update - { - public: - class success - {}; - class failure - {}; - }; -}; - -} // end namespace reserved - /** * @brief Uncached allocator (used as a base for other allocators) * @@ -348,23 +325,6 @@ public: state.submitted_stream = nullptr; state.cleanup(); set_phase(backend_ctx_untyped::phase::finalized); - -#ifdef CUDASTF_DEBUG - const char* display_stats_env = getenv("CUDASTF_DISPLAY_STATS"); - if (!display_stats_env || atoi(display_stats_env) == 0) - { - return; - } - - fprintf( - stderr, "[STATS CUDA GRAPHS] instantiated=%lu\n", reserved::counter.load()); - fprintf(stderr, "[STATS CUDA GRAPHS] launched=%lu\n", reserved::counter.load()); - fprintf(stderr, - "[STATS CUDA GRAPHS] updated=%lu success=%ld failed=%ld\n", - reserved::counter.load(), - reserved::counter.load(), - reserved::counter.load()); -#endif } void submit(cudaStream_t stream = nullptr) @@ -394,10 +354,6 @@ public: cuda_try(cudaGraphLaunch(*state.exec_graph, state.submitted_stream)); -#ifdef CUDASTF_DEBUG - reserved::counter.increment(); -#endif - // Note that we comment this out for now, so that it is possible to use // the print_to_dot method; but we may perhaps discard this graph to // some dedicated member variable. @@ -614,10 +570,6 @@ private: cuda_try(cudaGraphInstantiateWithFlags(res.get(), g, 0)); -#ifdef CUDASTF_DEBUG - reserved::counter.increment(); -#endif - return res; } @@ -712,10 +664,6 @@ private: cuda_try(cudaGraphLaunch(local_exec_graph, state.submitted_stream)); -#ifdef CUDASTF_DEBUG - reserved::counter.increment(); -#endif - return state.submitted_stream; } diff --git a/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh b/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh index f6e79500d11..aba0c3e3735 100644 --- a/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh @@ -433,15 +433,6 @@ inline event_list event_impl::from_stream(backend_ctx_untyped&, cudaStream_t) co } _CCCL_DIAG_POP -namespace reserved -{ - -// For counters -class join_tag -{}; - -} // end namespace reserved - /** * @brief Introduce a dependency from all entries of an event list to an event. @@ -474,7 +465,6 @@ void join(context_t& ctx, some_event& to, event_list& prereq_in) { from = static_cast(item.operator->()); } - reserved::counter::increment(); to.insert_dep(ctx.async_resources(), *from); from->outbound_deps++; } diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index 56f4a719a4b..06fed8c7ff7 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -554,11 +554,8 @@ protected: reserved::dot::instance().is_timing()); // We generate symbols if we may use them -#ifdef CUDASTF_DEBUG - generate_event_symbols = true; -#else generate_event_symbols = dot->is_tracing_prereqs(); -#endif + // Record it in the list of all traced contexts reserved::dot::instance().per_ctx.push_back(dot); } @@ -575,8 +572,6 @@ protected: } display_transfers(); - - fprintf(stderr, "TOTAL SYNC COUNT: %lu\n", reserved::counter::load()); } impl(const impl&) = delete; diff --git a/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh b/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh index 9b60beb6345..40d2b1bf30d 100644 --- a/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh @@ -54,18 +54,6 @@ inline bool try_updating_executable_graph(cudaGraphExec_t exec_graph, cudaGraph_ // Be sure to "erase" the last error cudaError_t res = cudaGetLastError(); -#ifdef CUDASTF_DEBUG - reserved::counter.increment(); - if (res == cudaSuccess) - { - reserved::counter.increment(); - } - else - { - reserved::counter.increment(); - } -#endif - return (res == cudaSuccess); } @@ -81,10 +69,6 @@ inline ::std::shared_ptr graph_instantiate(cudaGraph_t g) cuda_try(cudaGraphInstantiateWithFlags(res.get(), g, 0)); -#ifdef CUDASTF_DEBUG - reserved::counter.increment(); -#endif - return res; } diff --git a/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh b/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh index e68dca7d0bf..b2f7320192f 100644 --- a/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh @@ -42,21 +42,6 @@ inline event join_with_stream( using stream_and_event_vector = small_vector, 7>; -/* Tag types for event counters */ -class cuda_event_tag -{ -public: - class created - {}; - class alive - {}; - class destroyed - {}; -}; - -class cuda_stream_wait_event_tag -{}; - } // namespace reserved /* This event type allows to synchronize a CUDA stream with a CUDA event in @@ -77,10 +62,6 @@ protected: if (cudaEvent) { cuda_safe_call(cudaEventDestroy(cudaEvent)); -#ifdef CUDASTF_DEBUG - reserved::counter::increment(); - reserved::counter::decrement(); -#endif // fprintf(stderr, "DESTROY EVENT %p #%d (created %d)\n", event, ++destroyed_event_cnt, // event_cnt); @@ -129,27 +110,13 @@ public: // Disable timing to avoid implicit barriers cudaEvent_t sync_event; cuda_safe_call(cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming)); -#ifdef CUDASTF_DEBUG - reserved::counter::increment(); - reserved::counter::increment(); - reserved::high_water_mark::record( - reserved::counter::load()); -#endif - cuda_safe_call(cudaEventRecord(sync_event, s2)); // According to documentation "event may be from a different device than stream." cuda_safe_call(cudaStreamWaitEvent(s1, sync_event, 0)); -#ifdef CUDASTF_DEBUG - reserved::counter.increment(); -#endif // Asynchronously destroy event to avoid a memleak cuda_safe_call(cudaEventDestroy(sync_event)); -#ifdef CUDASTF_DEBUG - reserved::counter::increment(); - reserved::counter::decrement(); -#endif }; } @@ -167,12 +134,6 @@ public: cuda_safe_call(cudaEventCreateWithFlags(&cudaEvent, cudaEventDisableTiming)); // fprintf(stderr, "CREATE EVENT %p %s\n", cudaEvent, get_symbol().c_str()); assert(cudaEvent); -#ifdef CUDASTF_DEBUG - reserved::counter::increment(); - reserved::counter::increment(); - reserved::high_water_mark::record( - reserved::counter::load()); -#endif cuda_safe_call(cudaEventRecord(cudaEvent, dstream.stream)); }; } @@ -186,9 +147,6 @@ public: if (!skip) { cuda_safe_call(cudaStreamWaitEvent(dstream.stream, from.cudaEvent, 0)); -#ifdef CUDASTF_DEBUG - reserved::counter.increment(); -#endif } } } @@ -415,9 +373,6 @@ private: if (!skip) { cuda_safe_call(cudaStreamWaitEvent(dstream.stream, se->get_cuda_event(), 0)); -#ifdef CUDASTF_DEBUG - reserved::counter.increment(); -#endif } } se->outbound_deps++; diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh index bb846ed65cb..c515233811b 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh @@ -36,6 +36,7 @@ #include // For implicit logical_data_untyped constructors #include #include +#include // for reserved::counter namespace cuda::experimental::stf { @@ -516,38 +517,6 @@ public: } state.cleanup(); set_phase(backend_ctx_untyped::phase::finalized); - -#ifdef CUDASTF_DEBUG - // Ensure that the total number of CUDA events created corresponds to - // the number of events destroyed - const auto alive = reserved::counter.load(); - if (alive != 0) - { - fprintf(stderr, - "WARNING!!! %lu CUDA events leaked (approx %lu created vs. %lu destroyed).\n", - alive, - reserved::counter.load(), - reserved::counter.load()); - } - - assert(alive == 0); - - const char* display_stats_env = getenv("CUDASTF_DISPLAY_STATS"); - if (!display_stats_env || atoi(display_stats_env) == 0) - { - return; - } - - fprintf(stderr, - "[STATS CUDA EVENTS] created=%lu destroyed=%lu alive=%lu reserved::high_water_mark=%lu\n", - reserved::counter.load(), - reserved::counter.load(), - alive, - reserved::high_water_mark.load()); - fprintf(stderr, - "[STATS CUDA EVENTS] cuda_stream_wait_event=%lu\n", - reserved::counter.load()); -#endif } float get_submission_time_ms() const diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh index 348136778b7..d1f51751dd0 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh @@ -415,10 +415,6 @@ private: cudaEvent_t sync_event; // Disable timing to avoid implicit barriers cuda_safe_call(cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming)); -#ifdef CUDASTF_DEBUG - reserved::counter ++; - reserved::high_water_mark.record(++reserved::counter); -#endif cuda_safe_call(cudaEventRecord(sync_event, streams[0].stream)); @@ -430,10 +426,6 @@ private: // Asynchronously destroy event to avoid a memleak cuda_safe_call(cudaEventDestroy(sync_event)); -#ifdef CUDASTF_DEBUG - reserved::counter.increment(); - reserved::counter.decrement(); -#endif if (current_dev != s0_dev) {