diff --git a/c2h/include/c2h/generators.h b/c2h/include/c2h/generators.h
index 20036088fa8..62f169e9e21 100644
--- a/c2h/include/c2h/generators.h
+++ b/c2h/include/c2h/generators.h
@@ -35,7 +35,24 @@
 #include <c2h/vector.h>
 
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-#  include <cub/util_type.cuh> // for <cuda_fp8.h>
+#  if defined(_CCCL_HAS_NVFP16)
+#    include <cuda_fp16.h>
+#  endif // _CCCL_HAS_NVFP16
+
+#  if defined(_CCCL_HAS_NVBF16)
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
+#    include <cuda_bf16.h>
+_CCCL_DIAG_POP
+
+#    if _CCCL_CUDACC_AT_LEAST(11, 8)
+// cuda_fp8.h resets default for C4127, so we have to guard the inclusion
+_CCCL_DIAG_PUSH
+#      include <cuda_fp8.h>
+_CCCL_DIAG_POP
+#    endif // _CCCL_CUDACC_AT_LEAST(11, 8)
+#  endif // _CCCL_HAS_NVBF16
+
 #  if defined(__CUDA_FP8_TYPES_EXIST__)
 namespace std
 {
diff --git a/cub/cub/detail/fast_modulo_division.cuh b/cub/cub/detail/fast_modulo_division.cuh
index aa2ffd371c0..24b3204801d 100644
--- a/cub/cub/detail/fast_modulo_division.cuh
+++ b/cub/cub/detail/fast_modulo_division.cuh
@@ -37,6 +37,9 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cub/detail/type_traits.cuh> // implicit_prom_t
+#include <cub/util_type.cuh> // CUB_IS_INT128_ENABLED
+
 #include <cuda/cmath> // cuda::std::ceil_div
 #include <cuda/std/bit> // std::has_single_bit
 #include <cuda/std/climits> // CHAR_BIT
@@ -44,9 +47,6 @@
 #include <cuda/std/limits> // numeric_limits
 #include <cuda/std/type_traits> // std::is_integral
 
-#include "cub/detail/type_traits.cuh" // implicit_prom_t
-#include "cub/util_type.cuh" // CUB_IS_INT128_ENABLED
-
 #if defined(CCCL_ENABLE_DEVICE_ASSERTIONS)
 _CCCL_NV_DIAG_SUPPRESS(186) // pointless comparison of unsigned integer with zero
 #endif // CCCL_ENABLE_DEVICE_ASSERTIONS
diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh
index 45d2446188f..05f2d6a41f6 100644
--- a/cub/cub/thread/thread_operators.cuh
+++ b/cub/cub/thread/thread_operators.cuh
@@ -56,6 +56,17 @@
 #include <cuda/std/type_traits> // cuda::std::common_type
 #include <cuda/std/utility> // cuda::std::forward
 
+#if defined(_CCCL_HAS_NVFP16)
+#  include <cuda_fp16.h>
+#endif // _CCCL_HAS_NVFP16
+
+#if defined(_CCCL_HAS_NVBF16)
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
+#  include <cuda_bf16.h>
+_CCCL_DIAG_POP
+#endif // _CCCL_HAS_NVBF16
+
 CUB_NAMESPACE_BEGIN
 
 // TODO(bgruber): deprecate in C++17 with a note: "replace by decltype(cuda::std::not_fn(EqualityOp{}))"
diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh
index d4b4a89fdfd..f384d907b34 100644
--- a/cub/cub/thread/thread_reduce.cuh
+++ b/cub/cub/thread/thread_reduce.cuh
@@ -54,6 +54,17 @@
 #include <cuda/std/cstdint> // uint16_t
 #include <cuda/std/functional> // cuda::std::plus
 
+#if defined(_CCCL_HAS_NVFP16)
+#  include <cuda_fp16.h>
+#endif // _CCCL_HAS_NVFP16
+
+#if defined(_CCCL_HAS_NVBF16)
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
+#  include <cuda_bf16.h>
+_CCCL_DIAG_POP
+#endif // _CCCL_HAS_NVBF16
+
 CUB_NAMESPACE_BEGIN
 
 //! @rst
diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh
index f062ebc4ae9..5bda9dfe98f 100644
--- a/cub/cub/util_type.cuh
+++ b/cub/cub/util_type.cuh
@@ -50,7 +50,16 @@
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
+#if defined(_CCCL_HAS_NVFP16)
+#  include <cuda_fp16.h>
+#endif // _CCCL_HAS_NVFP16
+
 #if defined(_CCCL_HAS_NVBF16)
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
+#  include <cuda_bf16.h>
+_CCCL_DIAG_POP
+
 #  if _CCCL_CUDACC_AT_LEAST(11, 8)
 // cuda_fp8.h resets default for C4127, so we have to guard the inclusion
 _CCCL_DIAG_PUSH
diff --git a/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h b/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h
index 9d3c835c464..d135f406702 100644
--- a/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h
+++ b/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h
@@ -39,15 +39,4 @@
 #  endif
 #endif // !_CCCL_HAS_NVBF16
 
-#if defined(_CCCL_HAS_NVFP16)
-#  include <cuda_fp16.h>
-#endif // _CCCL_HAS_NVFP16
-
-#if defined(_CCCL_HAS_NVBF16)
-_CCCL_DIAG_PUSH
-_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
-#  include <cuda_bf16.h>
-_CCCL_DIAG_POP
-#endif // _CCCL_HAS_NVFP16
-
 #endif // __CCCL_EXTENDED_FLOATING_POINT_H
diff --git a/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h b/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h
index dcc4330e107..bb1afa4225b 100644
--- a/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h
+++ b/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h
@@ -22,6 +22,17 @@
 
 #include <cuda/std/__type_traits/integral_constant.h>
 
+#if defined(_LIBCUDACXX_HAS_NVFP16)
+#  include <cuda_fp16.h>
+#endif // _LIBCUDACXX_HAS_NVFP16
+
+#if defined(_LIBCUDACXX_HAS_NVBF16)
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
+#  include <cuda_bf16.h>
+_CCCL_DIAG_POP
+#endif // _LIBCUDACXX_HAS_NVBF16
+
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
@@ -39,8 +50,6 @@ _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v
 #endif // !_CCCL_NO_VARIABLE_TEMPLATES
 
 #if defined(_LIBCUDACXX_HAS_NVFP16)
-#  include <cuda_fp16.h>
-
 template <>
 struct __is_extended_floating_point<__half> : true_type
 {};
@@ -52,11 +61,6 @@ _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<__half> = true;
 #endif // _LIBCUDACXX_HAS_NVFP16
 
 #if defined(_LIBCUDACXX_HAS_NVBF16)
-_CCCL_DIAG_PUSH
-_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
-#  include <cuda_bf16.h>
-_CCCL_DIAG_POP
-
 template <>
 struct __is_extended_floating_point<__nv_bfloat16> : true_type
 {};
diff --git a/libcudacxx/include/cuda/std/__type_traits/promote.h b/libcudacxx/include/cuda/std/__type_traits/promote.h
index 01b06989513..daa545c5fa1 100644
--- a/libcudacxx/include/cuda/std/__type_traits/promote.h
+++ b/libcudacxx/include/cuda/std/__type_traits/promote.h
@@ -28,6 +28,7 @@
 #ifdef _LIBCUDACXX_HAS_NVFP16
 #  include <cuda_fp16.h>
 #endif // _LIBCUDACXX_HAS_NVFP16
+
 #ifdef _LIBCUDACXX_HAS_NVBF16
 _CCCL_DIAG_PUSH
 _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
diff --git a/thrust/thrust/system/cuda/detail/sort.h b/thrust/thrust/system/cuda/detail/sort.h
index a582cf2f3c6..3de3d5492f7 100644
--- a/thrust/thrust/system/cuda/detail/sort.h
+++ b/thrust/thrust/system/cuda/detail/sort.h
@@ -60,6 +60,17 @@
 
 #  include <cstdint>
 
+#  if defined(_CCCL_HAS_NVFP16)
+#    include <cuda_fp16.h>
+#  endif // _CCCL_HAS_NVFP16
+
+#  if defined(_CCCL_HAS_NVBF16)
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
+#    include <cuda_bf16.h>
+_CCCL_DIAG_POP
+#  endif // _CCCL_HAS_NVBF16
+
 THRUST_NAMESPACE_BEGIN
 namespace cuda_cub
 {