More fft utils #795

Merged Jan 16, 2025 (45 commits)

Changes from 8 commits

Commits
7ca1e7d
Merge branch 'master' into more_fft_utils
Fletterio Nov 11, 2024
e2df09a
Merge branch 'master' into more_fft_utils
Fletterio Nov 12, 2024
dcc537e
More changes following Bloom PR review
Fletterio Nov 13, 2024
65bbad8
Adds ternary op for complex numbers
Fletterio Nov 19, 2024
16d3261
Merge master
Fletterio Nov 19, 2024
13dd52d
Restore submodule pointer
Fletterio Nov 19, 2024
b31705d
Merge branch 'master' into more_fft_utils
Fletterio Nov 19, 2024
e618e58
Yet more utils, such as bitreversal
Fletterio Nov 23, 2024
58d8929
Add functionality for nabla unpacking trades when doing FFT of packed…
Fletterio Dec 3, 2024
cd07b9b
Share complex types between cpp and hlsl, add mirror trade functional…
Fletterio Dec 3, 2024
ea31887
Also add fast mul by i,-i to cpp
Fletterio Dec 4, 2024
53541ff
Modify ternary operator in complex and add it as a functional struct …
Fletterio Dec 5, 2024
fdba8ce
Point at examples test master to merge Nabla master
Fletterio Dec 5, 2024
620e601
Merge branch 'master' into more_fft_utils
Fletterio Dec 5, 2024
4a16b5d
padDimensions and getOutputBufferSize rewritten so they can be shared…
Fletterio Dec 6, 2024
e61ab7a
Forgot what changed
Fletterio Dec 7, 2024
20b4e3a
adds findLSB and findMSB from std.450 to glsl_compat.hlsl
Fletterio Dec 7, 2024
9abf1de
Require concepts for Accessors for FFT
Fletterio Dec 10, 2024
f3ad5e8
Change submodule pointer so it's not changed by CMake
Fletterio Dec 10, 2024
975a7b7
Fixed accessor concepts for FFT
Fletterio Dec 11, 2024
2dc70c1
Merge branch 'master' into more_fft_utils
Fletterio Dec 11, 2024
83e0cbd
- Differentiate concepts for FFT based on ElementsPerInvocationLog2,
Fletterio Dec 11, 2024
1412b01
Renamed some parameters so they better convey intent
Fletterio Dec 12, 2024
3b72975
Comment change
Fletterio Dec 13, 2024
6a22158
Update examples submodule pointer
Fletterio Dec 13, 2024
dc10958
Merge branch 'master' into more_fft_utils
Fletterio Dec 13, 2024
f44c8d9
Merge branch 'master' into more_fft_utils
Fletterio Dec 17, 2024
e9a5b8e
SharedMemAccessor concept update
Fletterio Dec 18, 2024
a933953
Merge master
Fletterio Dec 18, 2024
3359e34
- Make most of intutil shared, deprecate the versions that were in the .h
Fletterio Dec 20, 2024
18931dd
Roll back the GLSL bitreverse change, it was fine after all
Fletterio Dec 20, 2024
d1666d4
Merge branch 'master' into more_fft_utils
Fletterio Dec 20, 2024
9f0713c
Merge branch 'bitreverse_intrinsic' into more_fft_utils
Fletterio Dec 20, 2024
47f018a
Merge branch 'bitreverse_intrinsic' into more_fft_utils
Fletterio Dec 20, 2024
0e6e31a
Change fft bitReverse name, update examples submodule pointer
Fletterio Dec 20, 2024
2eb0ffd
Merge master
Fletterio Jan 6, 2025
6401e53
Addressed PR review comments
Fletterio Jan 10, 2025
fdb7904
Move some HLSL stuff to CPP-shared
Fletterio Jan 13, 2025
4463278
Merge branch 'master' into more_fft_utils
Fletterio Jan 13, 2025
6b8714d
Moved readme over
Fletterio Jan 14, 2025
d0ed313
Seeing if this fixes Markdown issue in gh readme
Fletterio Jan 14, 2025
d4dc129
No line breaks in latex math for gh readmes
Fletterio Jan 14, 2025
6edab6d
Even worse, no two $ math mode in latex readme
Fletterio Jan 14, 2025
036c7dd
Going insane at GH readme not parsing this well
Fletterio Jan 14, 2025
e8f46dd
Fixed
Fletterio Jan 14, 2025
13 changes: 13 additions & 0 deletions include/nbl/builtin/hlsl/complex.hlsl
@@ -379,6 +379,19 @@ complex_t<Scalar> rotateRight(NBL_CONST_REF_ARG(complex_t<Scalar>) value)
return retVal;
}

// Annoyed at having to write a lot of boilerplate to do a select
// Essentially returns what you'd expect from doing `condition ? a : b`
template<typename Scalar>
complex_t<Scalar> ternaryOperator(bool condition, NBL_CONST_REF_ARG(complex_t<Scalar>) a, NBL_CONST_REF_ARG(complex_t<Scalar>) b)
{
const vector<Scalar, 2> aVector = vector<Scalar, 2>(a.real(), a.imag());
const vector<Scalar, 2> bVector = vector<Scalar, 2>(b.real(), b.imag());
const vector<Scalar, 2> resultVector = condition ? aVector : bVector;
const complex_t<Scalar> result = { resultVector.x, resultVector.y };
return result;
}


}
}

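A quick usage sketch for the new helper (the wrapper function below is invented for illustration; ternaryOperator and conj come from complex.hlsl):

// Hypothetical caller - branchless pick between a twiddle factor and its conjugate,
// equivalent to `inverse ? conj(w) : w` without writing the componentwise select by hand
template<typename Scalar>
complex_t<Scalar> pickTwiddle(bool inverse, NBL_CONST_REF_ARG(complex_t<Scalar>) w)
{
    return ternaryOperator<Scalar>(inverse, conj(w), w);
}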
61 changes: 60 additions & 1 deletion include/nbl/builtin/hlsl/fft/common.hlsl
@@ -1,9 +1,47 @@
#ifndef _NBL_BUILTIN_HLSL_FFT_COMMON_INCLUDED_
#define _NBL_BUILTIN_HLSL_FFT_COMMON_INCLUDED_

#include "nbl/builtin/hlsl/complex.hlsl"
#include "nbl/builtin/hlsl/cpp_compat.hlsl"

#ifndef __HLSL_VERSION
#include <nbl/core/math/intutil.h>

namespace nbl
{
namespace hlsl
{
namespace fft
{

static inline uint32_t3 padDimensions(uint32_t3 dimensions, std::span<uint16_t> axes, bool realFFT = false)
{
uint16_t axisCount = 0;
for (auto i : axes)
{
dimensions[i] = core::roundUpToPoT(dimensions[i]);
if (realFFT && !axisCount++)
dimensions[i] /= 2;
}
return dimensions;
}

static inline uint64_t getOutputBufferSize(const uint32_t3& inputDimensions, uint32_t numChannels, std::span<uint16_t> axes, bool realFFT = false, bool halfFloats = false)

Reviewer:

you could have just replaced uint32_t3 with vector<uint32_t,M> and made it available both for C++ and HLSL

also why is span<uint16_t> used? you could just deduce the axis count from N

roundUpToPoT could move to hlsl namespace actually

Contributor Author:

roundUpToPoT is in core/math/intutil.h. I can create a builtin/math/intutil.hlsl, copy most functions over and refactor every usage of the functions in that file

Reviewer:
ask @Przemog1 about the preferred location so it fits in with the spirit of #801

You don't actually need to do a big refactor if you provide a "legacy" alias (make the old nbl::core::roundUpToPoT call nbl::hlsl::roundUpToPoT) and add the [[deprecated]] attribute
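A minimal sketch of that suggestion (assuming the shared implementation ends up as nbl::hlsl::roundUpToPoT; illustrative, not the PR's actual code):

namespace nbl::core
{
// Legacy alias - forwards to the new shared implementation and warns at compile time
template<typename T>
[[deprecated("use nbl::hlsl::roundUpToPoT instead")]]
inline T roundUpToPoT(T value) { return nbl::hlsl::roundUpToPoT<T>(value); }
}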

Contributor Author:
I'm creating builtin/hlsl/math/intutil.hlsl, @Przemog1 lmk if you'd rather have it named differently or placed somewhere else

{
auto paddedDims = padDimensions(inputDimensions, axes, realFFT);
uint64_t numberOfComplexElements = uint64_t(paddedDims[0]) * paddedDims[1] * paddedDims[2] * numChannels;
return 2 * numberOfComplexElements * (halfFloats ? sizeof(float16_t) : sizeof(float32_t));
}


}
}
}
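As a sanity check of the sizing math, a hypothetical host-side call (dimensions and channel count invented for the example):

// Illustrative only - FFT of a 4-channel 1280x720 image along X and Y:
uint16_t axes[] = { 0, 1 };
const uint32_t3 inputDims = { 1280, 720, 1 };
// padDimensions rounds 1280 -> 2048 and 720 -> 1024 (next powers of two),
// so the padded grid holds 2048 * 1024 * 1 * 4 = 8388608 complex elements;
// at 2 floats per complex and 4 bytes per float32_t that is 67108864 bytes (64 MiB)
const uint64_t byteSize = nbl::hlsl::fft::getOutputBufferSize(inputDims, 4, axes);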

#else

#include "nbl/builtin/hlsl/complex.hlsl"
#include "nbl/builtin/hlsl/numbers.hlsl"
#include "nbl/builtin/hlsl/concepts.hlsl"

namespace nbl
{
@@ -53,8 +91,29 @@ using DIT = DIX<true, Scalar>;

template<typename Scalar>
using DIF = DIX<false, Scalar>;

// ------------------------------------------------- Utils ---------------------------------------------------------
//
// Util to unpack two values from the FFT of the packed signal x + iy - outputs are returned in the same input arguments, storing FFT(x) in lo and FFT(y) in hi
template<typename Scalar>
void unpack(NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi)
{
complex_t<Scalar> x = (lo + conj(hi)) * Scalar(0.5);
hi = rotateRight<Scalar>(lo - conj(hi)) * Scalar(0.5);
lo = x;
}
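For reference, the identity implemented here is the standard two-for-one real FFT trick; writing Z for the FFT of the packed sequence x + iy, and reading lo and hi as the mirrored pair Z[k] and Z[N-k] (with rotateRight acting as multiplication by -i, an interpretation inferred from the code):

// FFT(x)[k] = (Z[k] + conj(Z[N-k])) / 2
// FFT(y)[k] = (Z[k] - conj(Z[N-k])) / (2i) = rotateRight(Z[k] - conj(Z[N-k])) / 2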

// Bit-reverses T as a binary string of length given by Bits
template<typename T, uint16_t Bits NBL_FUNC_REQUIRES(is_integral_v<T> && Bits <= sizeof(T) * 8)
T bitReverse(T value)
{
return glsl::bitfieldReverse<uint32_t>(value) >> (sizeof(T) * 8 - Bits);
}
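A small illustration of the convention (values picked for the example):

// Reverses the low `Bits` bits and keeps the result right-aligned:
// 0001 reversed over 4 bits is 1000
const uint32_t r = nbl::hlsl::fft::bitReverse<uint32_t, 4>(0x1u); // r == 0x8u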

}
}
}

#endif

#endif
146 changes: 103 additions & 43 deletions include/nbl/builtin/hlsl/workgroup/fft.hlsl
@@ -1,13 +1,45 @@
#ifndef _NBL_BUILTIN_HLSL_WORKGROUP_FFT_INCLUDED_
#define _NBL_BUILTIN_HLSL_WORKGROUP_FFT_INCLUDED_

#include <nbl/builtin/hlsl/cpp_compat.hlsl>
#include <nbl/builtin/hlsl/fft/common.hlsl>

#ifndef __HLSL_VERSION
#include <nbl/video/IPhysicalDevice.h>

namespace nbl
{
namespace hlsl
{
namespace workgroup
{
namespace fft
{

inline std::pair<uint16_t, uint16_t> optimalFFTParameters(const video::ILogicalDevice* device, uint32_t inputArrayLength)
{
uint32_t maxWorkgroupSize = *device->getPhysicalDevice()->getLimits().maxWorkgroupSize;
// This is the logic found in core::roundUpToPoT to get the log2
uint16_t workgroupSizeLog2 = 1u + hlsl::findMSB(core::min(inputArrayLength / 2, maxWorkgroupSize) - 1u);
uint16_t elementPerInvocationLog2 = 1u + hlsl::findMSB(core::max((inputArrayLength >> workgroupSizeLog2) - 1u, 1u));
return { elementPerInvocationLog2, workgroupSizeLog2 };
}

}
}
}
}
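Worked through by hand for a hypothetical device (illustrative numbers; `device` is assumed to be a valid video::ILogicalDevice*):

// inputArrayLength = 4096, maxWorkgroupSize = 512:
//   workgroupSizeLog2        = 1 + findMSB(min(4096 / 2, 512) - 1) = 1 + findMSB(511) = 9  -> 512 invocations
//   elementPerInvocationLog2 = 1 + findMSB(max((4096 >> 9) - 1, 1)) = 1 + findMSB(7)  = 3  -> 8 elements each
// 512 invocations * 8 elements = 4096, exactly the FFT length
const auto [elementsPerInvocationLog2, workgroupSizeLog2] = optimalFFTParameters(device, 4096);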

#else

#include "nbl/builtin/hlsl/subgroup/fft.hlsl"
#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
#include "nbl/builtin/hlsl/workgroup/shuffle.hlsl"
#include "nbl/builtin/hlsl/mpl.hlsl"
#include "nbl/builtin/hlsl/memory_accessor.hlsl"
#include "nbl/builtin/hlsl/bit.hlsl"
#include "nbl/builtin/hlsl/concepts.hlsl"

// Caveats
// - Sin and Cos in HLSL take 32-bit floats. Using this library with 64-bit floats works perfectly fine, but DXC will emit warnings
@@ -90,20 +122,7 @@ namespace impl
}
} //namespace impl

// Get the required size (in number of uint32_t elements) of the workgroup shared memory array needed for the FFT
template <typename scalar_t, uint16_t WorkgroupSize>
NBL_CONSTEXPR uint32_t SharedMemoryDWORDs = (sizeof(complex_t<scalar_t>) / sizeof(uint32_t)) * WorkgroupSize;

// Util to unpack two values from the packed FFT X + iY - get outputs in the same input arguments, storing x to lo and y to hi
template<typename Scalar>
void unpack(NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi)
{
complex_t<Scalar> x = (lo + conj(hi)) * Scalar(0.5);
hi = rotateRight<Scalar>(lo - conj(hi)) * Scalar(0.5);
lo = x;
}

template<uint16_t ElementsPerInvocation, uint16_t WorkgroupSize>
template<uint16_t ElementsPerInvocationLog2, uint16_t WorkgroupSizeLog2>
struct FFTIndexingUtils
{
// This function maps the index `idx` in the output array of a Nabla FFT to the index `freqIdx` in the DFT such that `DFT[freqIdx] = NablaFFT[idx]`
@@ -132,16 +151,36 @@ struct FFTIndexingUtils
return getNablaIndex(getDFTMirrorIndex(getDFTIndex(idx)));
}

NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocationLog2 = mpl::log2<ElementsPerInvocation>::value;
NBL_CONSTEXPR_STATIC_INLINE uint16_t FFTSizeLog2 = ElementsPerInvocationLog2 + mpl::log2<WorkgroupSize>::value;
NBL_CONSTEXPR_STATIC_INLINE uint32_t FFTSize = uint32_t(WorkgroupSize) * uint32_t(ElementsPerInvocation);
NBL_CONSTEXPR_STATIC_INLINE uint16_t FFTSizeLog2 = ElementsPerInvocationLog2 + WorkgroupSizeLog2;
NBL_CONSTEXPR_STATIC_INLINE uint32_t FFTSize = uint32_t(1) << FFTSizeLog2;
};
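Since the remaining method bodies are collapsed in this view, a brief caller-side sketch of how the visible composition is meant to be used (the template arguments are invented, and the method name is inferred from the return statement above):

// Hypothetical: 2^3 elements per invocation, 2^9 invocations -> 4096-point FFT
using indexing_t = fft::FFTIndexingUtils<3, 9>;
// NablaFFT[idx] holds DFT[getDFTIndex(idx)]; composing with getDFTMirrorIndex and
// mapping back through getNablaIndex finds the element holding the mirrored frequency:
const uint32_t mirrorIdx = indexing_t::getNablaMirrorIndex(idx);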

} //namespace fft

// ----------------------------------- End Utils -----------------------------------------------
// ----------------------------------- End Utils --------------------------------------------------------------

template<uint16_t ElementsPerInvocation, bool Inverse, uint16_t WorkgroupSize, typename Scalar, class device_capabilities=void>
namespace fft
{

template<uint16_t _ElementsPerInvocationLog2, uint16_t _WorkgroupSizeLog2, typename _Scalar NBL_PRIMARY_REQUIRES(_ElementsPerInvocationLog2 > 0 && _WorkgroupSizeLog2 >= 5)
struct ConstevalParameters
{
using scalar_t = _Scalar;

NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocationLog2 = _ElementsPerInvocationLog2;
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2;
NBL_CONSTEXPR_STATIC_INLINE uint32_t TotalSize = uint32_t(1) << (ElementsPerInvocationLog2 + WorkgroupSizeLog2);

NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocation = uint16_t(1) << ElementsPerInvocationLog2;
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1) << WorkgroupSizeLog2;

// Required size (in number of uint32_t elements) of the workgroup shared memory array needed for the FFT
NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemoryDWORDs = (sizeof(complex_t<scalar_t>) / sizeof(uint32_t)) << WorkgroupSizeLog2;
};

} //namespace fft
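A sketch of how a shader might wire these up (the sizes, scalar type, and accessor setup are illustrative; the actual Accessor requirements are listed just below):

// 2^1 = 2 elements per invocation, 2^9 = 512 invocations -> a 1024-point FFT
using params_t = fft::ConstevalParameters<1, 9, float32_t>;
// 512 complex float32_t elements = 1024 DWORDs of shared memory
groupshared uint32_t sharedmem[params_t::SharedMemoryDWORDs];
// ... construct `accessor` and `sharedmemAccessor` satisfying the requirements below ...
FFT<false, params_t>::__call(accessor, sharedmemAccessor);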

template<bool Inverse, typename consteval_params_t, class device_capabilities=void>
struct FFT;

// For the FFT methods below, we assume:
@@ -161,9 +200,11 @@ struct FFT;
// * void workgroupExecutionAndMemoryBarrier();

// 2 items per invocation forward specialization
template<uint16_t WorkgroupSize, typename Scalar, class device_capabilities>
struct FFT<2,false, WorkgroupSize, Scalar, device_capabilities>
template<uint16_t WorkgroupSizeLog2, typename Scalar, class device_capabilities>
struct FFT<false, fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>, device_capabilities>
{
using consteval_params_t = fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>;

template<typename SharedMemoryAdaptor>
static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
{
@@ -177,6 +218,8 @@ struct FFT<2,false, WorkgroupSize, Scalar, device_capabilities>
template<typename Accessor, typename SharedMemoryAccessor>
static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
{
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = consteval_params_t::WorkgroupSize;

// Compute the indices only once
const uint32_t threadID = uint32_t(SubgroupContiguousIndex());
const uint32_t loIx = threadID;
@@ -222,12 +265,12 @@
}
};



// 2 items per invocation inverse specialization
template<uint16_t WorkgroupSize, typename Scalar, class device_capabilities>
struct FFT<2,true, WorkgroupSize, Scalar, device_capabilities>
template<uint16_t WorkgroupSizeLog2, typename Scalar, class device_capabilities>
struct FFT<true, fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>, device_capabilities>
{
using consteval_params_t = fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>;

template<typename SharedMemoryAdaptor>
static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
{
@@ -241,6 +284,8 @@ struct FFT<2,true, WorkgroupSize, Scalar, device_capabilities>
template<typename Accessor, typename SharedMemoryAccessor>
static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
{
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = consteval_params_t::WorkgroupSize;

// Compute the indices only once
const uint32_t threadID = uint32_t(SubgroupContiguousIndex());
const uint32_t loIx = threadID;
@@ -291,17 +336,23 @@ struct FFT<2,true, WorkgroupSize, Scalar, device_capabilities>
};

// Forward FFT
template<uint32_t K, uint16_t WorkgroupSize, typename Scalar, class device_capabilities>
struct FFT<K, false, WorkgroupSize, Scalar, device_capabilities>
template<uint16_t ElementsPerInvocationLog2, uint16_t WorkgroupSizeLog2, typename Scalar, class device_capabilities>
struct FFT<false, fft::ConstevalParameters<ElementsPerInvocationLog2, WorkgroupSizeLog2, Scalar>, device_capabilities>
{
using consteval_params_t = fft::ConstevalParameters<ElementsPerInvocationLog2, WorkgroupSizeLog2, Scalar>;
using small_fft_consteval_params_t = fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>;

template<typename Accessor, typename SharedMemoryAccessor>
static enable_if_t< (mpl::is_pot_v<K> && K > 2), void > __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
{
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = consteval_params_t::WorkgroupSize;
NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocation = consteval_params_t::ElementsPerInvocation;

[unroll]
for (uint32_t stride = (K / 2) * WorkgroupSize; stride > WorkgroupSize; stride >>= 1)
for (uint32_t stride = (ElementsPerInvocation / 2) * WorkgroupSize; stride > WorkgroupSize; stride >>= 1)
{
[unroll]
for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K / 2) * WorkgroupSize; virtualThreadID += WorkgroupSize)
for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (ElementsPerInvocation / 2) * WorkgroupSize; virtualThreadID += WorkgroupSize)
{
const uint32_t loIx = ((virtualThreadID & (~(stride - 1))) << 1) | (virtualThreadID & (stride - 1));
const uint32_t hiIx = loIx | stride;
@@ -318,47 +369,53 @@ struct FFT<K, false, WorkgroupSize, Scalar, device_capabilities>
accessor.memoryBarrier(); // no execution barrier just making sure writes propagate to accessor
}

// do K/2 small workgroup FFTs
// do ElementsPerInvocation/2 small workgroup FFTs
accessor_adaptors::Offset<Accessor> offsetAccessor;
offsetAccessor.accessor = accessor;
[unroll]
for (uint32_t k = 0; k < K; k += 2)
for (uint32_t k = 0; k < ElementsPerInvocation; k += 2)
{
if (k)
sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
offsetAccessor.offset = WorkgroupSize*k;
FFT<2,false, WorkgroupSize, Scalar, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor);
FFT<false, small_fft_consteval_params_t, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor);
}
accessor = offsetAccessor.accessor;
}
};

// Inverse FFT
template<uint32_t K, uint16_t WorkgroupSize, typename Scalar, class device_capabilities>
struct FFT<K, true, WorkgroupSize, Scalar, device_capabilities>
template<uint16_t ElementsPerInvocationLog2, uint16_t WorkgroupSizeLog2, typename Scalar, class device_capabilities>
struct FFT<true, fft::ConstevalParameters<ElementsPerInvocationLog2, WorkgroupSizeLog2, Scalar>, device_capabilities>
{
using consteval_params_t = fft::ConstevalParameters<ElementsPerInvocationLog2, WorkgroupSizeLog2, Scalar>;
using small_fft_consteval_params_t = fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>;

template<typename Accessor, typename SharedMemoryAccessor>
static enable_if_t< (mpl::is_pot_v<K> && K > 2), void > __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
{
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = consteval_params_t::WorkgroupSize;
NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocation = consteval_params_t::ElementsPerInvocation;

// do ElementsPerInvocation/2 small workgroup FFTs
accessor_adaptors::Offset<Accessor> offsetAccessor;
offsetAccessor.accessor = accessor;
[unroll]
for (uint32_t k = 0; k < K; k += 2)
for (uint32_t k = 0; k < ElementsPerInvocation; k += 2)
{
if (k)
sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
offsetAccessor.offset = WorkgroupSize*k;
FFT<2,true, WorkgroupSize, Scalar, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor);
FFT<true, small_fft_consteval_params_t, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor);
}
accessor = offsetAccessor.accessor;

[unroll]
for (uint32_t stride = 2 * WorkgroupSize; stride < K * WorkgroupSize; stride <<= 1)
for (uint32_t stride = 2 * WorkgroupSize; stride < ElementsPerInvocation * WorkgroupSize; stride <<= 1)
{
accessor.memoryBarrier(); // no execution barrier just making sure writes propagate to accessor
[unroll]
for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K / 2) * WorkgroupSize; virtualThreadID += WorkgroupSize)
for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (ElementsPerInvocation / 2) * WorkgroupSize; virtualThreadID += WorkgroupSize)
{
const uint32_t loIx = ((virtualThreadID & (~(stride - 1))) << 1) | (virtualThreadID & (stride - 1));
const uint32_t hiIx = loIx | stride;
@@ -370,11 +427,11 @@ struct FFT<K, true, WorkgroupSize, Scalar, device_capabilities>
hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true,Scalar>(virtualThreadID & (stride - 1), stride), lo,hi);

// Divide by special factor at the end
if ( (K / 2) * WorkgroupSize == stride)
if ( (ElementsPerInvocation / 2) * WorkgroupSize == stride)
{
divides_assign< complex_t<Scalar> > divAss;
divAss(lo, K / 2);
divAss(hi, K / 2);
divAss(lo, ElementsPerInvocation / 2);
divAss(hi, ElementsPerInvocation / 2);
}

accessor.set(loIx, lo);
@@ -390,4 +447,7 @@
}
}


#endif

#endif