From e2d3d5744da116d0452cc34f68148fd09dbb0320 Mon Sep 17 00:00:00 2001 From: Dmitrii Tolmachev Date: Sun, 3 Mar 2024 00:06:21 +0100 Subject: [PATCH] single kernel - multiple batches convolution support (#159) -enabled through singleKernelMultipleBatches parameter -kernel batching is controlled through coordinateFeatures -number of input/output systems is controlled through numberBatches -sample 53 shows the usage of this option --- CMakeLists.txt | 2 + VkFFT_TestSuite.cpp | 7 + ...volution_VkFFT_single_2d_Nimages_1kernel.h | 4 + ...lution_VkFFT_single_2d_Nimages_1kernel.cpp | 327 ++++++++++++++++++ .../vkFFT_AppManagement/vkFFT_InitializeApp.h | 104 +++--- .../vkFFT_KernelsLevel1/vkFFT_ReadWrite.h | 2 +- .../vkFFT_Plans/vkFFT_Plan_FFT.h | 1 + .../vkFFT_Plans/vkFFT_Plan_R2C.h | 2 + vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h | 2 + 9 files changed, 399 insertions(+), 52 deletions(-) create mode 100644 benchmark_scripts/vkFFT_scripts/include/sample_53_convolution_VkFFT_single_2d_Nimages_1kernel.h create mode 100644 benchmark_scripts/vkFFT_scripts/src/sample_53_convolution_VkFFT_single_2d_Nimages_1kernel.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index df7ef193..092c79ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,7 @@ if(build_VkFFT_FFTW_precision) benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp + benchmark_scripts/vkFFT_scripts/src/sample_53_convolution_VkFFT_single_2d_Nimages_1kernel.cpp benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp benchmark_scripts/vkFFT_scripts/src/sample_1000_benchmark_VkFFT_single_2_4096.cpp @@ -85,6 +86,7 @@ else() 
benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp + benchmark_scripts/vkFFT_scripts/src/sample_53_convolution_VkFFT_single_2d_Nimages_1kernel.cpp benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp benchmark_scripts/vkFFT_scripts/src/sample_1000_benchmark_VkFFT_single_2_4096.cpp diff --git a/VkFFT_TestSuite.cpp b/VkFFT_TestSuite.cpp index ed35ec1f..39ffb271 100644 --- a/VkFFT_TestSuite.cpp +++ b/VkFFT_TestSuite.cpp @@ -82,6 +82,7 @@ #include "sample_50_convolution_VkFFT_single_1d_matrix.h" #include "sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.h" #include "sample_52_convolution_VkFFT_single_2d_batched_r2c.h" +#include "sample_53_convolution_VkFFT_single_2d_Nimages_1kernel.h" #include "sample_100_benchmark_VkFFT_single_nd_dct.h" #include "sample_101_benchmark_VkFFT_double_nd_dct.h" @@ -423,6 +424,11 @@ VkFFTResult launchVkFFT(VkGPU* vkGPU, uint64_t sample_id, bool file_output, FILE { resFFT = sample_52_convolution_VkFFT_single_2d_batched_r2c(vkGPU, file_output, output, isCompilerInitialized); break; + } + case 53: + { + resFFT = sample_53_convolution_VkFFT_single_2d_Nimages_1kernel(vkGPU, file_output, output, isCompilerInitialized); + break; } case 110: { @@ -638,6 +644,7 @@ int main(int argc, char* argv[]) printf(" 50 - convolution example with identity kernel\n"); printf(" 51 - zeropadding convolution example with identity kernel\n"); printf(" 52 - batched convolution example with identity kernel\n"); + printf(" 53 - convolution example with one scaling kernel of three colors, multiple images of three colors\n"); printf(" 110 - VkFFT FFT + iFFT R2R DCT-1 multidimensional benchmark in single precision\n"); printf(" 111 
- VkFFT FFT + iFFT R2R DCT-1 multidimensional benchmark in double precision\n"); printf(" 120 - VkFFT FFT + iFFT R2R DCT-2 multidimensional benchmark in single precision\n"); diff --git a/benchmark_scripts/vkFFT_scripts/include/sample_53_convolution_VkFFT_single_2d_Nimages_1kernel.h b/benchmark_scripts/vkFFT_scripts/include/sample_53_convolution_VkFFT_single_2d_Nimages_1kernel.h new file mode 100644 index 00000000..12f1385f --- /dev/null +++ b/benchmark_scripts/vkFFT_scripts/include/sample_53_convolution_VkFFT_single_2d_Nimages_1kernel.h @@ -0,0 +1,4 @@ +#include "vkFFT.h" +#include "utils_VkFFT.h" + +VkFFTResult sample_53_convolution_VkFFT_single_2d_Nimages_1kernel(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized); \ No newline at end of file diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_53_convolution_VkFFT_single_2d_Nimages_1kernel.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_53_convolution_VkFFT_single_2d_Nimages_1kernel.cpp new file mode 100644 index 00000000..a3336163 --- /dev/null +++ b/benchmark_scripts/vkFFT_scripts/src/sample_53_convolution_VkFFT_single_2d_Nimages_1kernel.cpp @@ -0,0 +1,327 @@ +//general parts +#include +#include +#include +#include +#include +#include +#include +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#include + +#if(VKFFT_BACKEND==0) +#include "vulkan/vulkan.h" +#include "glslang/Include/glslang_c_interface.h" +#elif(VKFFT_BACKEND==1) +#include +#include +#include +#include +#include +#elif(VKFFT_BACKEND==2) +#ifndef __HIP_PLATFORM_HCC__ +#define __HIP_PLATFORM_HCC__ +#endif +#include +#include +#include +#include +#elif(VKFFT_BACKEND==3) +#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#endif +#ifdef __APPLE__ +#include +#else +#include +#endif +#elif(VKFFT_BACKEND==4) +#include +#elif(VKFFT_BACKEND==5) +#include "Foundation/Foundation.hpp" +#include "QuartzCore/QuartzCore.hpp" +#include "Metal/Metal.hpp" +#endif +#include 
"vkFFT.h" +#include "utils_VkFFT.h" + +VkFFTResult sample_53_convolution_VkFFT_single_2d_Nimages_1kernel(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized) +{ + VkFFTResult resFFT = VKFFT_SUCCESS; +#if(VKFFT_BACKEND==0) + VkResult res = VK_SUCCESS; +#elif(VKFFT_BACKEND==1) + cudaError_t res = cudaSuccess; +#elif(VKFFT_BACKEND==2) + hipError_t res = hipSuccess; +#elif(VKFFT_BACKEND==3) + cl_int res = CL_SUCCESS; +#elif(VKFFT_BACKEND==4) + ze_result_t res = ZE_RESULT_SUCCESS; +#elif(VKFFT_BACKEND==5) +#endif + if (file_output) + fprintf(output, "53 - VkFFT convolution example with one scaling kernel of three colors, multiple images of three colors\n"); + printf("53 - VkFFT convolution example with one scaling kernel of three colors, multiple images of three colors\n"); + //Configuration + FFT application. + VkFFTConfiguration configuration = {}; + VkFFTConfiguration convolution_configuration = {}; + VkFFTApplication app_convolution = {}; + VkFFTApplication app_kernel = {}; + //Convolution sample code + //Setting up FFT configuration. FFT is performed in-place with no performance loss. + + configuration.FFTdim = 2; //FFT dimension, 1D, 2D or 3D (default 1). + configuration.size[0] = 32; //Multidimensional FFT dimensions sizes (default 1). For best performance (and stability), order dimensions in descendant size order as: x>y>z. + configuration.size[1] = 32; + configuration.size[2] = 1; + + configuration.kernelConvolution = true; //specify if this plan is used to create kernel for convolution + configuration.coordinateFeatures = 3; //batching control parameter - for example, number of colors in the image and kernel + configuration.normalize = 1;//normalize iFFT + + //After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. 
[uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored. +#if(VKFFT_BACKEND==5) + configuration.device = vkGPU->device; +#else + configuration.device = &vkGPU->device; +#endif +#if(VKFFT_BACKEND==0) + configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers + configuration.fence = &vkGPU->fence; + configuration.commandPool = &vkGPU->commandPool; + configuration.physicalDevice = &vkGPU->physicalDevice; + configuration.isCompilerInitialized = isCompilerInitialized;//compiler can be initialized before VkFFT plan creation. if not, VkFFT will create and destroy one after initialization +#elif(VKFFT_BACKEND==3) + configuration.context = &vkGPU->context; +#elif(VKFFT_BACKEND==4) + configuration.context = &vkGPU->context; + configuration.commandQueue = &vkGPU->commandQueue; + configuration.commandQueueID = vkGPU->commandQueueID; +#elif(VKFFT_BACKEND==5) + configuration.queue = vkGPU->queue; +#endif + + uint64_t kernelSize = ((uint64_t)configuration.coordinateFeatures) * sizeof(float) * 2 * configuration.size[0] * configuration.size[1] * configuration.size[2];; + +#if(VKFFT_BACKEND==0) + VkBuffer kernel = {}; + VkDeviceMemory kernelDeviceMemory = {}; + resFFT = allocateBuffer(vkGPU, &kernel, &kernelDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, kernelSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + configuration.buffer = &kernel; +#elif(VKFFT_BACKEND==1) + cuFloatComplex* kernel = 0; + res = cudaMalloc((void**)&kernel, kernelSize); + if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = (void**)&kernel; +#elif(VKFFT_BACKEND==2) + hipFloatComplex* kernel = 0; + res = hipMalloc((void**)&kernel, kernelSize); + if (res != hipSuccess) return 
VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = (void**)&kernel; +#elif(VKFFT_BACKEND==3) + cl_mem kernel = 0; + kernel = clCreateBuffer(vkGPU->context, CL_MEM_READ_WRITE, kernelSize, 0, &res); + if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = &kernel; +#elif(VKFFT_BACKEND==4) + void* kernel = 0; + ze_device_mem_alloc_desc_t device_desc = {}; + device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + res = zeMemAllocDevice(vkGPU->context, &device_desc, kernelSize, sizeof(float), vkGPU->device, &kernel); + if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + configuration.buffer = &kernel; +#elif(VKFFT_BACKEND==5) + MTL::Buffer* kernel = 0; + kernel = vkGPU->device->newBuffer(kernelSize, MTL::ResourceStorageModePrivate); + configuration.buffer = &kernel; +#endif + + configuration.bufferSize = &kernelSize; + + if (file_output) + fprintf(output, "Total memory needed for kernel: %" PRIu64 " MB\n", kernelSize / 1024 / 1024); + printf("Total memory needed for kernel: %" PRIu64 " MB\n", kernelSize / 1024 / 1024); + + //Fill kernel on CPU. 
+ float* kernel_input = (float*)malloc(kernelSize); + if (!kernel_input) return VKFFT_ERROR_MALLOC_FAILED; + for (uint64_t v = 0; v < configuration.coordinateFeatures; v++) { + for (uint64_t k = 0; k < configuration.size[2]; k++) { + for (uint64_t j = 0; j < configuration.size[1]; j++) { + + //Below is the test identity kernel for 1x1 nonsymmetric FFT, multiplied by (v + 1); + for (uint64_t i = 0; i < configuration.size[0]; i++) { + + kernel_input[2 * (i + j * configuration.size[0] + k * configuration.size[0] * configuration.size[1] + v * configuration.size[0] * configuration.size[1] * configuration.size[2])] = (float)(v + 1.0); + kernel_input[2 * (i + j * configuration.size[0] + k * configuration.size[0] * configuration.size[1] + v * configuration.size[0] * configuration.size[1] * configuration.size[2]) + 1] = 0; + + } + } + } + } + //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers. + resFFT = transferDataFromCPU(vkGPU, kernel_input, &kernel, kernelSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + //Initialize application responsible for the kernel. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library. + resFFT = initializeVkFFT(&app_kernel, configuration); + if (resFFT != VKFFT_SUCCESS) return resFFT; + //Sample forward FFT command buffer allocation + execution performed on kernel. Second number determines how many times perform application in one submit. FFT can also be appended to user defined command buffers. + + //Uncomment the line below if you want to perform kernel FFT. In this sample we use predefined identity kernel. + //performVulkanFFT(vkGPU, &app_kernel, -1, 1); + + //The kernel has been transformed. + + + //2. Buffer convolution with transformed kernel.
+ //Copy configuration, as it mostly remains unchanged. Change specific parts. + convolution_configuration = configuration; + convolution_configuration.kernelConvolution = false; + convolution_configuration.performConvolution = true; + +#if(VKFFT_BACKEND==0) + convolution_configuration.kernel = &kernel; +#elif(VKFFT_BACKEND==1) + convolution_configuration.kernel = (void**)&kernel; +#elif(VKFFT_BACKEND==2) + convolution_configuration.kernel = (void**)&kernel; +#elif(VKFFT_BACKEND==3) + convolution_configuration.kernel = &kernel; +#elif(VKFFT_BACKEND==4) + convolution_configuration.kernel = (void**)&kernel; +#elif(VKFFT_BACKEND==5) + convolution_configuration.kernel = &kernel; +#endif + + convolution_configuration.kernelSize = &kernelSize; + convolution_configuration.numberBatches = 3; + convolution_configuration.singleKernelMultipleBatches = true; + //Allocate separate buffer for the input data. + uint64_t bufferSize = convolution_configuration.numberBatches * convolution_configuration.coordinateFeatures * sizeof(float) * 2 * convolution_configuration.size[0] * convolution_configuration.size[1] * convolution_configuration.size[2];; + +#if(VKFFT_BACKEND==0) + VkBuffer buffer = {}; + VkDeviceMemory bufferDeviceMemory = {}; + resFFT = allocateBuffer(vkGPU, &buffer, &bufferDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + convolution_configuration.buffer = &buffer; +#elif(VKFFT_BACKEND==1) + cuFloatComplex* buffer = 0; + res = cudaMalloc((void**)&buffer, bufferSize); + if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + convolution_configuration.buffer = (void**)&buffer; +#elif(VKFFT_BACKEND==2) + hipFloatComplex* buffer = 0; + res = hipMalloc((void**)&buffer, bufferSize); + if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + convolution_configuration.buffer = (void**)&buffer; 
+#elif(VKFFT_BACKEND==3) + cl_mem buffer = 0; + buffer = clCreateBuffer(vkGPU->context, CL_MEM_READ_WRITE, bufferSize, 0, &res); + if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + convolution_configuration.buffer = &buffer; +#elif(VKFFT_BACKEND==4) + void* buffer = 0; + res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer); + if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; + convolution_configuration.buffer = &buffer; +#elif(VKFFT_BACKEND==5) + MTL::Buffer* buffer = 0; + buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate); + convolution_configuration.buffer = &buffer; +#endif + + convolution_configuration.bufferSize = &bufferSize; + + + if (file_output) + fprintf(output, "Total memory needed for buffer: %" PRIu64 " MB\n", bufferSize / 1024 / 1024); + printf("Total memory needed for buffer: %" PRIu64 " MB\n", bufferSize / 1024 / 1024); + //Fill data on CPU. It is best to perform all operations on GPU after initial upload. 
+ float* buffer_input = (float*)malloc(bufferSize); + if (!buffer_input) return VKFFT_ERROR_MALLOC_FAILED; + for (uint64_t f = 0; f < convolution_configuration.numberBatches; f++) { + for (uint64_t v = 0; v < convolution_configuration.coordinateFeatures; v++) { + for (uint64_t k = 0; k < convolution_configuration.size[2]; k++) { + for (uint64_t j = 0; j < convolution_configuration.size[1]; j++) { + for (uint64_t i = 0; i < convolution_configuration.size[0]; i++) { + buffer_input[2 * (i + j * convolution_configuration.size[0] + k * convolution_configuration.size[0] * convolution_configuration.size[1] + v * convolution_configuration.size[0] * convolution_configuration.size[1] * convolution_configuration.size[2] + f * convolution_configuration.size[0] * convolution_configuration.size[1] * convolution_configuration.size[2] * convolution_configuration.coordinateFeatures)] = (float)(f * configuration.coordinateFeatures + v + 1.0); + buffer_input[2 * (i + j * convolution_configuration.size[0] + k * convolution_configuration.size[0] * convolution_configuration.size[1] + v * convolution_configuration.size[0] * convolution_configuration.size[1] * convolution_configuration.size[2] + f * convolution_configuration.size[0] * convolution_configuration.size[1] * convolution_configuration.size[2] * convolution_configuration.coordinateFeatures) + 1] = 0; + } + } + } + } + } + //Transfer data to GPU using staging buffer. + resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + + //Initialize application responsible for the convolution. + resFFT = initializeVkFFT(&app_convolution, convolution_configuration); + if (resFFT != VKFFT_SUCCESS) return resFFT; + //Sample forward FFT command buffer allocation + execution performed on kernel. FFT can also be appended to user defined command buffers. 
+ VkFFTLaunchParams launchParams = {}; + resFFT = performVulkanFFT(vkGPU, &app_convolution, &launchParams, -1, 1); + if (resFFT != VKFFT_SUCCESS) return resFFT; + //The convolution has been performed. + + float* buffer_output = (float*)malloc(bufferSize); + if (!buffer_output) return VKFFT_ERROR_MALLOC_FAILED; + //Transfer data from GPU using staging buffer. + resFFT = transferDataToCPU(vkGPU, buffer_output, &buffer, bufferSize); + if (resFFT != VKFFT_SUCCESS) return resFFT; + + //Print data, if needed. + for (uint64_t f = 0; f < convolution_configuration.numberBatches; f++) { + if (file_output) + fprintf(output, "\nBatch id: %" PRIu64 "\n\n", f); + printf("\nBatch id: %" PRIu64 "\n\n", f); + for (uint64_t v = 0; v < convolution_configuration.coordinateFeatures; v++) { + if (file_output) + fprintf(output, "\ncoordinate: %" PRIu64 "\n\n", v); + printf("\ncoordinate: %" PRIu64 "\n\n", v); + for (uint64_t k = 0; k < convolution_configuration.size[2]; k++) { + for (uint64_t j = 0; j < convolution_configuration.size[1]; j++) { + for (uint64_t i = 0; i < convolution_configuration.size[0]; i++) { + if (file_output) + fprintf(output, "(%.2f, %.2f) ", buffer_output[2 * (i + j * convolution_configuration.size[0] + k * convolution_configuration.size[0] * convolution_configuration.size[1] + v * convolution_configuration.size[0] * convolution_configuration.size[1] * convolution_configuration.size[2] + f * convolution_configuration.size[0] * convolution_configuration.size[1] * convolution_configuration.size[2] * convolution_configuration.coordinateFeatures)], buffer_output[2 * (i + j * convolution_configuration.size[0] + k * convolution_configuration.size[0] * convolution_configuration.size[1] + v * convolution_configuration.size[0] * convolution_configuration.size[1] * convolution_configuration.size[2] + f * convolution_configuration.size[0] * convolution_configuration.size[1] * convolution_configuration.size[2] * convolution_configuration.coordinateFeatures) + 1]); + +
printf("(%.2f, %.2f) ", buffer_output[2 * (i + j * convolution_configuration.size[0] + k * convolution_configuration.size[0] * convolution_configuration.size[1] + v * convolution_configuration.size[0] * convolution_configuration.size[1] * convolution_configuration.size[2] + f * convolution_configuration.size[0] * convolution_configuration.size[1] * convolution_configuration.size[2] * convolution_configuration.coordinateFeatures)], buffer_output[2 * (i + j * convolution_configuration.size[0] + k * convolution_configuration.size[0] * convolution_configuration.size[1] + v * convolution_configuration.size[0] * convolution_configuration.size[1] * convolution_configuration.size[2] + f * convolution_configuration.size[0] * convolution_configuration.size[1] * convolution_configuration.size[2] * convolution_configuration.coordinateFeatures) + 1]); + } + printf("\n"); + } + } + } + } + free(kernel_input); + free(buffer_input); + free(buffer_output); +#if(VKFFT_BACKEND==0) + vkDestroyBuffer(vkGPU->device, buffer, NULL); + vkFreeMemory(vkGPU->device, bufferDeviceMemory, NULL); + vkDestroyBuffer(vkGPU->device, kernel, NULL); + vkFreeMemory(vkGPU->device, kernelDeviceMemory, NULL); +#elif(VKFFT_BACKEND==1) + cudaFree(buffer); + cudaFree(kernel); +#elif(VKFFT_BACKEND==2) + hipFree(buffer); + hipFree(kernel); +#elif(VKFFT_BACKEND==3) + clReleaseMemObject(buffer); + clReleaseMemObject(kernel); +#elif(VKFFT_BACKEND==4) + zeMemFree(vkGPU->context, buffer); + zeMemFree(vkGPU->context, kernel); +#elif(VKFFT_BACKEND==5) + buffer->release(); + kernel->release(); +#endif + deleteVkFFT(&app_kernel); + deleteVkFFT(&app_convolution); + return resFFT; +} diff --git a/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h b/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h index b5acf055..488bad48 100644 --- a/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h +++ b/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h @@ -1176,56 +1176,6 @@ static inline VkFFTResult 
setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf if (inputLaunchConfiguration.kernelOffset != 0) app->configuration.kernelOffset = inputLaunchConfiguration.kernelOffset; if (inputLaunchConfiguration.specifyOffsetsAtLaunch != 0) app->configuration.specifyOffsetsAtLaunch = inputLaunchConfiguration.specifyOffsetsAtLaunch; //set optional parameters: - pfUINT checkBufferSizeFor64BitAddressing = 0; - for (pfUINT i = 0; i < app->configuration.bufferNum; i++) { - if (app->configuration.bufferSize) - checkBufferSizeFor64BitAddressing += app->configuration.bufferSize[i]; - else { - checkBufferSizeFor64BitAddressing = app->configuration.size[0] * app->configuration.size[1] * app->configuration.size[2] * 8; - if (app->configuration.coordinateFeatures > 0) checkBufferSizeFor64BitAddressing *= app->configuration.coordinateFeatures; - if (app->configuration.numberBatches > 0) checkBufferSizeFor64BitAddressing *= app->configuration.numberBatches; - if (app->configuration.numberKernels > 0) checkBufferSizeFor64BitAddressing *= app->configuration.numberKernels; - if (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) checkBufferSizeFor64BitAddressing *= 2; - if (app->configuration.quadDoubleDoublePrecision) checkBufferSizeFor64BitAddressing *= 4; - } - } -#if(VKFFT_BACKEND==2) - app->configuration.useStrict32BitAddress = 0; - if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) app->configuration.useStrict32BitAddress = -1; -#endif - if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1; - checkBufferSizeFor64BitAddressing = 0; - for (pfUINT i = 0; i < app->configuration.inputBufferNum; i++) { - if (app->configuration.inputBufferSize) - checkBufferSizeFor64BitAddressing += app->configuration.inputBufferSize[i]; - } -#if(VKFFT_BACKEND==2) - if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) 
app->configuration.useStrict32BitAddress = -1; -#endif - if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1; - - checkBufferSizeFor64BitAddressing = 0; - for (pfUINT i = 0; i < app->configuration.outputBufferNum; i++) { - if (app->configuration.outputBufferSize) - checkBufferSizeFor64BitAddressing += app->configuration.outputBufferSize[i]; - } - if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1; - - checkBufferSizeFor64BitAddressing = 0; - for (pfUINT i = 0; i < app->configuration.kernelNum; i++) { - if (app->configuration.kernelSize) - checkBufferSizeFor64BitAddressing += app->configuration.kernelSize[i]; - } -#if(VKFFT_BACKEND==2) - if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) app->configuration.useStrict32BitAddress = -1; - // No reason was found to disable strict 32 bit addressing, so enable it - if (app->configuration.useStrict32BitAddress == 0) app->configuration.useStrict32BitAddress = 1; -#endif - if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1; - if (inputLaunchConfiguration.useUint64 != 0) app->configuration.useUint64 = inputLaunchConfiguration.useUint64; -#if(VKFFT_BACKEND==2) - if (inputLaunchConfiguration.useStrict32BitAddress != 0) app->configuration.useStrict32BitAddress = inputLaunchConfiguration.useStrict32BitAddress; -#endif if (inputLaunchConfiguration.maxThreadsNum != 0) app->configuration.maxThreadsNum = inputLaunchConfiguration.maxThreadsNum; if (inputLaunchConfiguration.coalescedMemory != 0) app->configuration.coalescedMemory = inputLaunchConfiguration.coalescedMemory; app->configuration.aimThreads = 128; @@ -1380,7 +1330,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf if (inputLaunchConfiguration.matrixConvolution != 0) app->configuration.matrixConvolution = 
inputLaunchConfiguration.matrixConvolution; if (inputLaunchConfiguration.numberKernels != 0) app->configuration.numberKernels = inputLaunchConfiguration.numberKernels; - + if (inputLaunchConfiguration.singleKernelMultipleBatches != 0) app->configuration.singleKernelMultipleBatches = inputLaunchConfiguration.singleKernelMultipleBatches; if (inputLaunchConfiguration.symmetricKernel != 0) app->configuration.symmetricKernel = inputLaunchConfiguration.symmetricKernel; if (inputLaunchConfiguration.conjugateConvolution != 0) app->configuration.conjugateConvolution = inputLaunchConfiguration.conjugateConvolution; if (inputLaunchConfiguration.crossPowerSpectrumNormalization != 0) app->configuration.crossPowerSpectrumNormalization = inputLaunchConfiguration.crossPowerSpectrumNormalization; @@ -1391,6 +1341,58 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf app->configuration.registerBoost4Step = 1; if (app->configuration.matrixConvolution > 1) app->configuration.coordinateFeatures = app->configuration.matrixConvolution; } + + pfUINT checkBufferSizeFor64BitAddressing = 0; + for (pfUINT i = 0; i < app->configuration.bufferNum; i++) { + if (app->configuration.bufferSize) + checkBufferSizeFor64BitAddressing += app->configuration.bufferSize[i]; + else { + checkBufferSizeFor64BitAddressing = app->configuration.size[0] * app->configuration.size[1] * app->configuration.size[2] * 8; + if (app->configuration.coordinateFeatures > 0) checkBufferSizeFor64BitAddressing *= app->configuration.coordinateFeatures; + if (app->configuration.numberBatches > 0) checkBufferSizeFor64BitAddressing *= app->configuration.numberBatches; + if (app->configuration.numberKernels > 0) checkBufferSizeFor64BitAddressing *= app->configuration.numberKernels; + if (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) checkBufferSizeFor64BitAddressing *= 2; + if (app->configuration.quadDoubleDoublePrecision) 
checkBufferSizeFor64BitAddressing *= 4; + } + } +#if(VKFFT_BACKEND==2) + app->configuration.useStrict32BitAddress = 0; + if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) app->configuration.useStrict32BitAddress = -1; +#endif + if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1; + checkBufferSizeFor64BitAddressing = 0; + for (pfUINT i = 0; i < app->configuration.inputBufferNum; i++) { + if (app->configuration.inputBufferSize) + checkBufferSizeFor64BitAddressing += app->configuration.inputBufferSize[i]; + } +#if(VKFFT_BACKEND==2) + if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) app->configuration.useStrict32BitAddress = -1; +#endif + if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1; + + checkBufferSizeFor64BitAddressing = 0; + for (pfUINT i = 0; i < app->configuration.outputBufferNum; i++) { + if (app->configuration.outputBufferSize) + checkBufferSizeFor64BitAddressing += app->configuration.outputBufferSize[i]; + } + if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1; + + checkBufferSizeFor64BitAddressing = 0; + for (pfUINT i = 0; i < app->configuration.kernelNum; i++) { + if (app->configuration.kernelSize) + checkBufferSizeFor64BitAddressing += app->configuration.kernelSize[i]; + } +#if(VKFFT_BACKEND==2) + if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) app->configuration.useStrict32BitAddress = -1; + // No reason was found to disable strict 32 bit addressing, so enable it + if (app->configuration.useStrict32BitAddress == 0) app->configuration.useStrict32BitAddress = 1; +#endif + if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1; + if (inputLaunchConfiguration.useUint64 != 0) app->configuration.useUint64 = inputLaunchConfiguration.useUint64; 
+#if(VKFFT_BACKEND==2) + if (inputLaunchConfiguration.useStrict32BitAddress != 0) app->configuration.useStrict32BitAddress = inputLaunchConfiguration.useStrict32BitAddress; +#endif + app->firstAxis = 0; app->lastAxis = app->configuration.FFTdim - 1; for (int i = 0; i < app->configuration.FFTdim; i++) { diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_ReadWrite.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_ReadWrite.h index ce7b580c..d04abc97 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_ReadWrite.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel1/vkFFT_ReadWrite.h @@ -381,7 +381,7 @@ static inline void appendKernelOffset(VkFFTSpecializationConstantsLayout* sc, in PfMul(sc, &temp_int, &sc->coordinate, &bufferStride[sc->numFFTdims], 0); PfAdd(sc, &sc->blockInvocationID, &sc->blockInvocationID, &temp_int); } - if ((sc->numBatches.data.i > 1) || (sc->numKernels.data.i > 1)) { + if (((sc->numBatches.data.i > 1) && (!sc->singleKernelMultipleBatches)) || (sc->numKernels.data.i > 1)) { if (sc->convolutionStep && (sc->numKernels.data.i > 1)) { PfMul(sc, &sc->tempInt, &sc->batchID, &sc->inputStride[sc->numFFTdims+1], 0); PfAdd(sc, &sc->blockInvocationID, &sc->blockInvocationID, &sc->tempInt); diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_FFT.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_FFT.h index e90907b7..464cf60c 100644 --- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_FFT.h +++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_FFT.h @@ -474,6 +474,7 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla axis->specializationConstants.numCoordinates = (app->configuration.matrixConvolution > 1) ? 
1 : (int)app->configuration.coordinateFeatures; axis->specializationConstants.matrixConvolution = (int)app->configuration.matrixConvolution; + axis->specializationConstants.singleKernelMultipleBatches = (int)app->configuration.singleKernelMultipleBatches; axis->specializationConstants.coordinate.type = 31; axis->specializationConstants.coordinate.data.i = 0; axis->specializationConstants.batchID.type = 31; diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_R2C.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_R2C.h index f5b5e8e0..43e8a17a 100644 --- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_R2C.h +++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_R2C.h @@ -195,6 +195,8 @@ static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* axis->specializationConstants.numCoordinates = (app->configuration.matrixConvolution > 1) ? 1 : (int)app->configuration.coordinateFeatures; axis->specializationConstants.matrixConvolution = (int)app->configuration.matrixConvolution; + axis->specializationConstants.singleKernelMultipleBatches = (int)app->configuration.singleKernelMultipleBatches; + for (pfUINT i = 0; i < VKFFT_MAX_FFT_DIMENSIONS; i++) { axis->specializationConstants.size[i].type = 31; axis->specializationConstants.size[i].data.i = (pfINT)app->configuration.size[i]; diff --git a/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h b/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h index 28178cf3..67ed7d7c 100644 --- a/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h +++ b/vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h @@ -274,6 +274,7 @@ typedef struct { pfUINT matrixConvolution; //if equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. Overrides coordinateFeatures pfUINT symmetricKernel; //specify if kernel in 2x2 or 3x3 matrix convolution is symmetric pfUINT numberKernels;// N - only used in convolution step - specify how many kernels were initialized before. 
Expands one input to multiple (batched) output + pfUINT singleKernelMultipleBatches;// 0 off, 1 - perform convolution with one kernel to multiple (numberBatches) input/output. kernel can still use multiple coordinates for batching (for example if you want to have 3 kernels cycling for 9 systems). Default 0 pfUINT kernelConvolution;// specify if this application is used to create kernel for convolution, so it has the same properties. performConvolution has to be set to 0 for kernel creation //register overutilization (experimental): (default 0 if not stated otherwise) @@ -802,6 +803,7 @@ typedef struct { pfUINT kernelBlockSize; int numCoordinates; int matrixConvolution; //if equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. Overrides coordinateFeatures + int singleKernelMultipleBatches; PfContainer numBatches; PfContainer numKernels; int conjugateConvolution;