diff --git a/CMakePresets.json b/CMakePresets.json
index 5940422fa47..a78a7c57edd 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -84,7 +84,6 @@
         "cudax_ENABLE_CUDASTF": true,
         "cudax_ENABLE_CUDASTF_BOUNDSCHECK": false,
         "cudax_ENABLE_CUDASTF_CODE_GENERATION": true,
-        "cudax_ENABLE_CUDASTF_DEBUG": false,
         "cudax_ENABLE_CUDASTF_MATHLIBS": false,
         "cudax_ENABLE_DIALECT_CPP17": true,
         "cudax_ENABLE_DIALECT_CPP20": true,
@@ -100,8 +99,7 @@
         "CMAKE_BUILD_TYPE": "Debug",
         "CMAKE_CUDA_FLAGS": "-G",
         "CCCL_ENABLE_BENCHMARKS": false,
-        "cudax_ENABLE_CUDASTF_BOUNDSCHECK": true,
-        "cudax_ENABLE_CUDASTF_DEBUG": true
+        "cudax_ENABLE_CUDASTF_BOUNDSCHECK": true
       }
     },
     {
diff --git a/cudax/CMakeLists.txt b/cudax/CMakeLists.txt
index a9725331592..86d2475988d 100644
--- a/cudax/CMakeLists.txt
+++ b/cudax/CMakeLists.txt
@@ -23,12 +23,11 @@ option(cudax_ENABLE_EXAMPLES "Build CUDA Experimental's examples." ON)
 option(cudax_ENABLE_CUDASTF "Enable CUDASTF subproject" ON)
 option(cudax_ENABLE_CUDASTF_CODE_GENERATION "Enable code generation using STF's parallel_for or launch with CUDA compiler." ON)
 option(cudax_ENABLE_CUDASTF_BOUNDSCHECK "Enable bounds checks for STF targets. Requires debug build." OFF)
-option(cudax_ENABLE_CUDASTF_DEBUG "Enable additional debugging for STF targets. Requires debug build." OFF)
 option(cudax_ENABLE_CUDASTF_MATHLIBS "Enable STF tests/examples that use cublas/cusolver." OFF)
 
-if ((cudax_ENABLE_CUDASTF_BOUNDSCHECK OR cudax_ENABLE_CUDASTF_DEBUG) AND
+if (cudax_ENABLE_CUDASTF_BOUNDSCHECK AND
     NOT CMAKE_BUILD_TYPE MATCHES "Debug" AND NOT CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo")
-  message(FATAL_ERROR "cudax_ENABLE_CUDASTF_BOUNDSCHECK and cudax_ENABLE_CUDASTF_DEBUG require a Debug build.")
+  message(FATAL_ERROR "cudax_ENABLE_CUDASTF_BOUNDSCHECK requires a Debug build.")
 endif()
 
 include(cmake/cudaxBuildCompilerTargets.cmake)
diff --git a/cudax/cmake/cudaxSTFConfigureTarget.cmake b/cudax/cmake/cudaxSTFConfigureTarget.cmake
index 26ddfd0060e..81c52e29722 100644
--- a/cudax/cmake/cudaxSTFConfigureTarget.cmake
+++ b/cudax/cmake/cudaxSTFConfigureTarget.cmake
@@ -41,10 +41,4 @@ function(cudax_stf_configure_target target_name)
       "CUDASTF_BOUNDSCHECK"
     )
   endif()
-
-  if (cudax_ENABLE_CUDASTF_DEBUG)
-    target_compile_definitions(${target_name} PRIVATE
-      "CUDASTF_DEBUG"
-    )
-  endif()
 endfunction()
diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh
index 58ca20b0873..77c43ab0770 100644
--- a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh
+++ b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh
@@ -39,29 +39,6 @@
 namespace cuda::experimental::stf
 {
 
-namespace reserved
-{
-
-// For counters
-class graph_tag
-{
-public:
-  class launch
-  {};
-  class instantiate
-  {};
-  class update
-  {
-  public:
-    class success
-    {};
-    class failure
-    {};
-  };
-};
-
-} // end namespace reserved
-
 /**
  * @brief Uncached allocator (used as a base for other allocators)
  *
@@ -348,23 +325,6 @@ public:
     state.submitted_stream = nullptr;
     state.cleanup();
     set_phase(backend_ctx_untyped::phase::finalized);
-
-#ifdef CUDASTF_DEBUG
-    const char* display_stats_env = getenv("CUDASTF_DISPLAY_STATS");
-    if (!display_stats_env || atoi(display_stats_env) == 0)
-    {
-      return;
-    }
-
-    fprintf(
-      stderr, "[STATS CUDA GRAPHS] instantiated=%lu\n", reserved::counter<reserved::graph_tag::instantiate>.load());
-    fprintf(stderr, "[STATS CUDA GRAPHS] launched=%lu\n", reserved::counter<reserved::graph_tag::launch>.load());
-    fprintf(stderr,
-            "[STATS CUDA GRAPHS] updated=%lu success=%ld failed=%ld\n",
-            reserved::counter<reserved::graph_tag::update>.load(),
-            reserved::counter<reserved::graph_tag::update::success>.load(),
-            reserved::counter<reserved::graph_tag::update::failure>.load());
-#endif
   }
 
   void submit(cudaStream_t stream = nullptr)
@@ -394,10 +354,6 @@ public:
 
     cuda_try(cudaGraphLaunch(*state.exec_graph, state.submitted_stream));
 
-#ifdef CUDASTF_DEBUG
-    reserved::counter<reserved::graph_tag::launch>.increment();
-#endif
-
     // Note that we comment this out for now, so that it is possible to use
     // the print_to_dot method; but we may perhaps discard this graph to
     // some dedicated member variable.
@@ -614,10 +570,6 @@ private:
 
     cuda_try(cudaGraphInstantiateWithFlags(res.get(), g, 0));
 
-#ifdef CUDASTF_DEBUG
-    reserved::counter<reserved::graph_tag::instantiate>.increment();
-#endif
-
     return res;
   }
 
@@ -712,10 +664,6 @@ private:
 
     cuda_try(cudaGraphLaunch(local_exec_graph, state.submitted_stream));
 
-#ifdef CUDASTF_DEBUG
-    reserved::counter<reserved::graph_tag::launch>.increment();
-#endif
-
     return state.submitted_stream;
   }
 
diff --git a/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh b/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh
index f6e79500d11..aba0c3e3735 100644
--- a/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh
@@ -433,15 +433,6 @@ inline event_list event_impl::from_stream(backend_ctx_untyped&, cudaStream_t) co
 }
 _CCCL_DIAG_POP
 
-namespace reserved
-{
-
-// For counters
-class join_tag
-{};
-
-} // end namespace reserved
-
 /**
  * @brief Introduce a dependency from all entries of an event list to an event.
 
@@ -474,7 +465,6 @@ void join(context_t& ctx, some_event& to, event_list& prereq_in)
     {
       from = static_cast<some_event*>(item.operator->());
     }
-    reserved::counter<reserved::join_tag>::increment();
     to.insert_dep(ctx.async_resources(), *from);
     from->outbound_deps++;
   }
diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
index 56f4a719a4b..06fed8c7ff7 100644
--- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
@@ -554,11 +554,8 @@ protected:
         reserved::dot::instance().is_timing());
 
       // We generate symbols if we may use them
-#ifdef CUDASTF_DEBUG
-      generate_event_symbols = true;
-#else
       generate_event_symbols = dot->is_tracing_prereqs();
-#endif
+
       // Record it in the list of all traced contexts
       reserved::dot::instance().per_ctx.push_back(dot);
     }
@@ -575,8 +572,6 @@ protected:
       }
 
       display_transfers();
-
-      fprintf(stderr, "TOTAL SYNC COUNT: %lu\n", reserved::counter<reserved::join_tag>::load());
     }
 
     impl(const impl&)            = delete;
diff --git a/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh b/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh
index 9b60beb6345..40d2b1bf30d 100644
--- a/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh
@@ -54,18 +54,6 @@ inline bool try_updating_executable_graph(cudaGraphExec_t exec_graph, cudaGraph_
   // Be sure to "erase" the last error
   cudaError_t res = cudaGetLastError();
 
-#ifdef CUDASTF_DEBUG
-  reserved::counter<reserved::graph_tag::update>.increment();
-  if (res == cudaSuccess)
-  {
-    reserved::counter<reserved::graph_tag::update::success>.increment();
-  }
-  else
-  {
-    reserved::counter<reserved::graph_tag::update::failure>.increment();
-  }
-#endif
-
   return (res == cudaSuccess);
 }
 
@@ -81,10 +69,6 @@ inline ::std::shared_ptr<cudaGraphExec_t> graph_instantiate(cudaGraph_t g)
 
   cuda_try(cudaGraphInstantiateWithFlags(res.get(), g, 0));
 
-#ifdef CUDASTF_DEBUG
-  reserved::counter<reserved::graph_tag::instantiate>.increment();
-#endif
-
   return res;
 }
 
diff --git a/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh b/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh
index e68dca7d0bf..b2f7320192f 100644
--- a/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh
@@ -42,21 +42,6 @@ inline event join_with_stream(
 
 using stream_and_event_vector = small_vector<reserved::handle<stream_and_event>, 7>;
 
-/* Tag types for event counters */
-class cuda_event_tag
-{
-public:
-  class created
-  {};
-  class alive
-  {};
-  class destroyed
-  {};
-};
-
-class cuda_stream_wait_event_tag
-{};
-
 } // namespace reserved
 
 /* This event type allows to synchronize a CUDA stream with a CUDA event in
@@ -77,10 +62,6 @@ protected:
     if (cudaEvent)
     {
       cuda_safe_call(cudaEventDestroy(cudaEvent));
-#ifdef CUDASTF_DEBUG
-      reserved::counter<reserved::cuda_event_tag::destroyed>::increment();
-      reserved::counter<reserved::cuda_event_tag::alive>::decrement();
-#endif
 
       //            fprintf(stderr, "DESTROY EVENT %p #%d (created %d)\n", event, ++destroyed_event_cnt,
       //            event_cnt);
@@ -129,27 +110,13 @@ public:
       // Disable timing to avoid implicit barriers
       cudaEvent_t sync_event;
       cuda_safe_call(cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming));
-#ifdef CUDASTF_DEBUG
-      reserved::counter<reserved::cuda_event_tag::created>::increment();
-      reserved::counter<reserved::cuda_event_tag::alive>::increment();
-      reserved::high_water_mark<reserved::cuda_event_tag::alive>::record(
-        reserved::counter<cuda_event_tag::alive>::load());
-#endif
-
       cuda_safe_call(cudaEventRecord(sync_event, s2));
 
       // According to documentation "event may be from a different device than stream."
       cuda_safe_call(cudaStreamWaitEvent(s1, sync_event, 0));
-#ifdef CUDASTF_DEBUG
-      reserved::counter<reserved::cuda_stream_wait_event_tag>.increment();
-#endif
 
       // Asynchronously destroy event to avoid a memleak
       cuda_safe_call(cudaEventDestroy(sync_event));
-#ifdef CUDASTF_DEBUG
-      reserved::counter<reserved::cuda_event_tag::destroyed>::increment();
-      reserved::counter<reserved::cuda_event_tag::alive>::decrement();
-#endif
     };
   }
 
@@ -167,12 +134,6 @@ public:
       cuda_safe_call(cudaEventCreateWithFlags(&cudaEvent, cudaEventDisableTiming));
       // fprintf(stderr, "CREATE EVENT %p %s\n", cudaEvent, get_symbol().c_str());
       assert(cudaEvent);
-#ifdef CUDASTF_DEBUG
-      reserved::counter<reserved::cuda_event_tag::created>::increment();
-      reserved::counter<reserved::cuda_event_tag::alive>::increment();
-      reserved::high_water_mark<reserved::cuda_event_tag::alive>::record(
-        reserved::counter<cuda_event_tag::alive>::load());
-#endif
       cuda_safe_call(cudaEventRecord(cudaEvent, dstream.stream));
     };
   }
@@ -186,9 +147,6 @@ public:
       if (!skip)
       {
         cuda_safe_call(cudaStreamWaitEvent(dstream.stream, from.cudaEvent, 0));
-#ifdef CUDASTF_DEBUG
-        reserved::counter<reserved::cuda_stream_wait_event_tag>.increment();
-#endif
       }
     }
   }
@@ -415,9 +373,6 @@ private:
         if (!skip)
         {
           cuda_safe_call(cudaStreamWaitEvent(dstream.stream, se->get_cuda_event(), 0));
-#ifdef CUDASTF_DEBUG
-          reserved::counter<reserved::cuda_stream_wait_event_tag>.increment();
-#endif
         }
       }
       se->outbound_deps++;
diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh
index bb846ed65cb..c515233811b 100644
--- a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh
@@ -36,6 +36,7 @@
 #include <cuda/experimental/__stf/stream/interfaces/slice.cuh> // For implicit logical_data_untyped constructors
 #include <cuda/experimental/__stf/stream/interfaces/void_interface.cuh>
 #include <cuda/experimental/__stf/stream/stream_task.cuh>
+#include <cuda/experimental/__stf/utility/threads.cuh> // for reserved::counter
 
 namespace cuda::experimental::stf
 {
@@ -516,38 +517,6 @@ public:
     }
     state.cleanup();
     set_phase(backend_ctx_untyped::phase::finalized);
-
-#ifdef CUDASTF_DEBUG
-    // Ensure that the total number of CUDA events created corresponds to
-    // the number of events destroyed
-    const auto alive = reserved::counter<reserved::cuda_event_tag::alive>.load();
-    if (alive != 0)
-    {
-      fprintf(stderr,
-              "WARNING!!! %lu CUDA events leaked (approx %lu created vs. %lu destroyed).\n",
-              alive,
-              reserved::counter<reserved::cuda_event_tag::created>.load(),
-              reserved::counter<reserved::cuda_event_tag::destroyed>.load());
-    }
-
-    assert(alive == 0);
-
-    const char* display_stats_env = getenv("CUDASTF_DISPLAY_STATS");
-    if (!display_stats_env || atoi(display_stats_env) == 0)
-    {
-      return;
-    }
-
-    fprintf(stderr,
-            "[STATS CUDA EVENTS] created=%lu destroyed=%lu alive=%lu reserved::high_water_mark=%lu\n",
-            reserved::counter<reserved::cuda_event_tag::created>.load(),
-            reserved::counter<reserved::cuda_event_tag::destroyed>.load(),
-            alive,
-            reserved::high_water_mark<reserved::cuda_event_tag>.load());
-    fprintf(stderr,
-            "[STATS CUDA EVENTS] cuda_stream_wait_event=%lu\n",
-            reserved::counter<reserved::cuda_stream_wait_event_tag>.load());
-#endif
   }
 
   float get_submission_time_ms() const
diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
index 348136778b7..d1f51751dd0 100644
--- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
@@ -415,10 +415,6 @@ private:
     cudaEvent_t sync_event;
     // Disable timing to avoid implicit barriers
     cuda_safe_call(cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming));
-#ifdef CUDASTF_DEBUG
-    reserved::counter<reserved::cuda_event_tag::created> ++;
-    reserved::high_water_mark<reserved::cuda_event_tag>.record(++reserved::counter<cuda_event_tag::alive>);
-#endif
 
     cuda_safe_call(cudaEventRecord(sync_event, streams[0].stream));
 
@@ -430,10 +426,6 @@ private:
 
     // Asynchronously destroy event to avoid a memleak
     cuda_safe_call(cudaEventDestroy(sync_event));
-#ifdef CUDASTF_DEBUG
-    reserved::counter<reserved::cuda_event_tag::destroyed>.increment();
-    reserved::counter<reserved::cuda_event_tag::alive>.decrement();
-#endif
 
     if (current_dev != s0_dev)
     {