add example

NVIDIA · Jan 12, 2025 · 865ef42 · 865ef42
1 parent f402bba
commit 865ef42
Show file tree

Hide file tree

Showing 2 changed files with 152 additions and 0 deletions.
diff --git a/cudax/examples/file.cu b/cudax/examples/file.cu
@@ -0,0 +1,151 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * Vector addition: C = A + B.
+ *
+ * This sample is a very basic sample that implements element by element
+ * vector addition. It is the same as the sample illustrating Chapter 2
+ * of the programming guide with some additions like error checking.
+ */
+
+#include <stdio.h>
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+#include <cuda/std/span>
+
+#include <cuda/experimental/file.cuh>
+#include <cuda/experimental/launch.cuh>
+#include <cuda/experimental/stream.cuh>
+
+#include "vector.cuh"
+
+namespace cudax = cuda::experimental;
+using cudax::in;
+using cudax::out;
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vector addition of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+__global__ void vectorAdd(cudax::span<const float> A, cudax::span<const float> B, cudax::span<float> C)
+{
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+  if (i < A.size())
+  {
+    C[i] = A[i] + B[i] + 0.0f;
+  }
+}
+
+/**
+ * Host main routine
+ */
+int main(int argc, const char* argv[])
+try
+{
+  // Create a file for writing
+  cudax::file outfile{argv[1], "w"};
+
+  // A CUDA stream on which to execute the vector addition kernel
+  cudax::stream stream(cudax::devices[0]);
+
+  // Register the stream with the file
+  cudax::file_stream_registerer file_stream_registerer(stream);
+
+  // Print the vector length to be used, and compute its size
+  int numElements = 50000;
+  printf("[Vector addition of %d elements]\n", numElements);
+
+  // Allocate the host vectors
+  cudax::vector<float> A(numElements); // input
+  cudax::vector<float> B(numElements); // input
+  cudax::vector<float> C(numElements); // output
+
+  const cuda::std::size_t numBytes = numElements * sizeof(float);
+
+  // Register the output vector for RDMA
+  cudax::file_buffer_registerer file_buffer_reg{&C[0], numBytes, cudax::file_buffer_flags::rdma_register};
+
+  // Initialize the host input vectors
+  for (int i = 0; i < numElements; ++i)
+  {
+    A[i] = rand() / (float) RAND_MAX;
+    B[i] = rand() / (float) RAND_MAX;
+  }
+
+  // Define the kernel launch parameters
+  constexpr int threadsPerBlock = 256;
+  auto config                   = cudax::distribute<threadsPerBlock>(numElements);
+
+  // Launch the vectorAdd kernel
+  printf(
+    "CUDA kernel launch with %d blocks of %d threads\n", config.dims.count(cudax::block, cudax::grid), threadsPerBlock);
+  cudax::launch(stream, config, vectorAdd, in(A), in(B), out(C));
+
+  ssize_t numBytesWritten{};
+
+  // launch the file write
+  outfile.write_async(stream, &C[0], &numBytes, &numBytesWritten);
+
+  printf("waiting for the stream to finish\n");
+  stream.wait();
+
+  if (numBytesWritten != numBytes)
+  {
+    fprintf(stderr, "Error: wrote %ld bytes, expected %ld\n", numBytesWritten, numBytes);
+    exit(EXIT_FAILURE);
+  }
+
+  printf("verifying the results\n");
+  // Verify that the result vector is correct
+  for (int i = 0; i < numElements; ++i)
+  {
+    if (fabs(A[i] + B[i] - C[i]) > 1e-5)
+    {
+      fprintf(stderr, "Result verification failed at element %d!\n", i);
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  printf("Test PASSED\n");
+
+  printf("Done\n");
+  return 0;
+}
+catch (const std::exception& e)
+{
+  printf("caught an exception: \"%s\"\n", e.what());
+}
+catch (...)
+{
+  printf("caught an unknown exception\n");
+}
diff --git a/cudax/include/cuda/experimental/file.cuh b/cudax/include/cuda/experimental/file.cuh
@@ -27,5 +27,6 @@
 
 #include <cuda/experimental/__file/file.cuh>
 #include <cuda/experimental/__file/file_driver.cuh>
+#include <cuda/experimental/__file/utils.cuh>
 
 #endif // __CUDAX_FILE___