From 81b1af6b1c29c435d4db0fcef2ce8ae63f4ac46d Mon Sep 17 00:00:00 2001 From: Jan Ciesko Date: Mon, 27 Jan 2025 23:53:43 -0700 Subject: [PATCH 1/3] cudastf (examples): Fix compiler errors when enabling examples for CUDA STF (#3516) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Make index types consistent in loops * Add implementation of missing operator Co-authored-by: Jan Ciesko Co-authored-by: Cédric Augonnet <158148890+caugonnet@users.noreply.github.com> --- .../examples/stf/linear_algebra/06-pdgemm.cu | 20 ++-- .../stf/linear_algebra/07-cholesky.cu | 42 ++++---- cudax/examples/stf/linear_algebra/07-potri.cu | 96 +++++++++---------- cudax/examples/stf/linear_algebra/cg_csr.cu | 8 ++ .../stf/linear_algebra/cg_dense_2D.cu | 18 ++-- cudax/examples/stf/linear_algebra/strassen.cu | 4 +- 6 files changed, 98 insertions(+), 90 deletions(-) diff --git a/cudax/examples/stf/linear_algebra/06-pdgemm.cu b/cudax/examples/stf/linear_algebra/06-pdgemm.cu index 9df5bc3c260..ec131946adb 100644 --- a/cudax/examples/stf/linear_algebra/06-pdgemm.cu +++ b/cudax/examples/stf/linear_algebra/06-pdgemm.cu @@ -160,9 +160,9 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - for (int rowb = 0; rowb < mt; rowb++) + for (size_t rowb = 0; rowb < mt; rowb++) { // Each task fills a block auto& h = get_handle(rowb, colb); @@ -251,9 +251,9 @@ void PDGEMM(stream_ctx& ctx, double beta, matrix& C) { - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C @@ -271,7 +271,7 @@ void PDGEMM(stream_ctx& ctx, if (transb == CUBLAS_OP_N) { assert(A.nt == B.mt); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -282,7 +282,7 @@ void PDGEMM(stream_ctx& ctx, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -296,7 +296,7 @@ void PDGEMM(stream_ctx& ctx, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -307,7 +307,7 @@ void PDGEMM(stream_ctx& ctx, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? 
beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -328,14 +328,14 @@ void run(stream_ctx& ctx, size_t N, size_t NB) cuda_safe_call(cudaGetDeviceCount(&ndevs)); /* Warm up allocators */ - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto lX = ctx.logical_data(shape_of>(1)); ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {}; } /* Initializes CUBLAS on all devices */ - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { cuda_safe_call(cudaSetDevice(d)); get_cublas_handle(); diff --git a/cudax/examples/stf/linear_algebra/07-cholesky.cu b/cudax/examples/stf/linear_algebra/07-cholesky.cu index 02d90fdf74b..5c7947fc5ac 100644 --- a/cudax/examples/stf/linear_algebra/07-cholesky.cu +++ b/cudax/examples/stf/linear_algebra/07-cholesky.cu @@ -91,10 +91,10 @@ public: handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = handle(rowb, colb); @@ -171,10 +171,10 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block auto& h = handle(rowb, colb); @@ -363,9 +363,9 @@ void PDNRM2_HOST(matrix* A, double* result) reserved::dot::set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -452,17 +452,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -475,17 +475,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? 
alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -543,9 +543,9 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C @@ -562,7 +562,7 @@ void PDGEMM(cublasOperation_t transa, //================================ if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -573,7 +573,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -587,7 +587,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -598,7 +598,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -640,7 +640,7 @@ int main(int argc, char** argv) int ndevs; cuda_safe_call(cudaGetDeviceCount(&ndevs)); - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto lX = ctx.logical_data(shape_of>(1)); ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {}; diff --git a/cudax/examples/stf/linear_algebra/07-potri.cu b/cudax/examples/stf/linear_algebra/07-potri.cu index b17dead1219..e80fbffa663 100644 --- a/cudax/examples/stf/linear_algebra/07-potri.cu +++ b/cudax/examples/stf/linear_algebra/07-potri.cu @@ -93,10 +93,10 @@ public: handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = get_handle(rowb, colb); @@ -173,10 +173,10 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { int low_rowb = sym_matrix ? 
colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block auto& h = get_handle(rowb, colb); @@ -804,9 +804,9 @@ void PDNRM2_HOST(matrix* A, double* result) ctx.get_dot()->set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->get_handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -888,17 +888,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -911,17 +911,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -983,9 +983,9 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { cuda_try(cudaSetDevice(C.get_preferred_devid(m, n))); @@ -1005,7 +1005,7 @@ void PDGEMM(cublasOperation_t transa, if (transb == CUBLAS_OP_N) { assert(A.nt == B.mt); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -1016,7 +1016,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -1030,7 +1030,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -1041,7 +1041,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? 
beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -1062,22 +1062,22 @@ void PDTRTRI(matrix& A, cublasFillMode_t uplo, cublasDiagType_t diag) nvtxRangePushA("SUBMIT_PDTRTRI"); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { - for (int m = k + 1; m < A.mt; m++) + for (size_t m = k + 1; m < A.mt; m++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, k))); DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, diag, -1.0, A, k, k, A, m, k); } - for (int m = k + 1; m < A.mt; m++) + for (size_t m = k + 1; m < A.mt; m++) { - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, n))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, 1.0, A, m, k, A, k, n, 1.0, A, m, n); } } - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(k, n))); DTRSM(CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, diag, 1.0, A, k, k, A, k, n); @@ -1101,20 +1101,20 @@ void PDLAUUM(matrix& A, cublasFillMode_t uplo) nvtxRangePushA("SUBMIT_PDLAUUM"); - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(n, n))); DSYRK(CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, 1.0, A, k, n, 1.0, A, n, n); - for (int m = n + 1; m < k; m++) + for (size_t m = n + 1; m < k; m++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, n))); DGEMM(CUBLAS_OP_T, CUBLAS_OP_N, 1.0, A, k, m, A, k, n, 1.0, A, m, n); } } - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(k, n))); DTRMM(CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, k, k, A, k, n); @@ -1136,7 +1136,7 @@ void PDSYMM(cublasSideMode_t side, double beta, matrix& C) { - int k, m, n; + size_t k, m, n; double zbeta; double zone = (double) 1.0; @@ -1272,15 +1272,15 @@ void PDTRMM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); DTRMM(side, uplo, trans, diag, alpha, A, m, m, B, m, n); - for (int k = m + 1; k < A.mt; k++) + for (size_t k = m + 1; k < A.mt; k++) { DGEMM(trans, CUBLAS_OP_N, alpha, A, m, k, B, k, n, 1.0, B, m, n); } @@ -1294,7 +1294,7 @@ void PDTRMM(cublasSideMode_t side, { for (int m = B.mt - 1; m > -1; m--) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1317,7 +1317,7 @@ void PDTRMM(cublasSideMode_t side, { for (int m = B.mt - 1; m > -1; m--) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1335,13 +1335,13 @@ void PDTRMM(cublasSideMode_t side, //================================================ else { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { DTRMM(side, uplo, trans, diag, alpha, A, m, m, B, m, n); - for (int k = m + 1; k < A.mt; k++) + for (size_t k = m + 1; k < A.mt; k++) { DGEMM(trans, CUBLAS_OP_N, alpha, A, k, m, B, k, n, 1.0, B, m, n); } @@ -1361,7 +1361,7 @@ void PDTRMM(cublasSideMode_t side, { for (int n = B.nt - 1; n > -1; n--) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { 
cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1379,15 +1379,15 @@ void PDTRMM(cublasSideMode_t side, //================================================= else { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); DTRMM(side, uplo, trans, diag, alpha, A, n, n, B, m, n); - for (int k = n + 1; k < A.mt; k++) + for (size_t k = n + 1; k < A.mt; k++) { DGEMM(CUBLAS_OP_N, trans, alpha, B, m, k, A, n, k, 1.0, B, m, n); } @@ -1402,15 +1402,15 @@ void PDTRMM(cublasSideMode_t side, //============================================ if (trans == CUBLAS_OP_N) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); DTRMM(side, uplo, trans, diag, alpha, A, n, n, B, m, n); - for (int k = n + 1; k < A.mt; k++) + for (size_t k = n + 1; k < A.mt; k++) { DGEMM(CUBLAS_OP_N, trans, alpha, B, m, k, A, k, n, 1.0, B, m, n); } @@ -1424,7 +1424,7 @@ void PDTRMM(cublasSideMode_t side, { for (int n = B.nt - 1; n > -1; n--) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1462,7 +1462,7 @@ void run(int N, int NB) int ndevs; cuda_try(cudaGetDeviceCount(&ndevs)); - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto ldummy = ctx.logical_data(shape_of>(1)); ctx.task(exec_place::device(d), ldummy.write())->*[](cudaStream_t, auto) { diff --git a/cudax/examples/stf/linear_algebra/cg_csr.cu b/cudax/examples/stf/linear_algebra/cg_csr.cu index 8d99f237050..ec0d3f1c9ff 100644 --- a/cudax/examples/stf/linear_algebra/cg_csr.cu +++ b/cudax/examples/stf/linear_algebra/cg_csr.cu @@ -100,6 +100,14 @@ public: }; } + // Assign constructor + scalar& operator=(scalar const& rhs) + { + ctx = rhs.ctx; + handle = ctx.logical_data(rhs.handle.shape()); + return *this; + } + // Copy constructor scalar(const scalar& a) { diff --git a/cudax/examples/stf/linear_algebra/cg_dense_2D.cu b/cudax/examples/stf/linear_algebra/cg_dense_2D.cu index ed2840dd00f..8bd2f957380 100644 --- a/cudax/examples/stf/linear_algebra/cg_dense_2D.cu +++ b/cudax/examples/stf/linear_algebra/cg_dense_2D.cu @@ -65,7 +65,7 @@ public: if (is_tmp) { // There is no physical backing for this temporary vector - for (int b = 0; b < nblocks; b++) + for (size_t b = 0; b < nblocks; b++) { size_t bs = std::min(N - block_size * b, block_size); handles[b] = to_shared(ctx.logical_data(shape_of>(bs))); @@ -91,7 +91,7 @@ public: { handles.resize(nblocks); - for (int b = 0; b < nblocks; b++) + for (size_t b = 0; b < nblocks; b++) { size_t bs = std::min(N - block_size * b, block_size); handles[b] = to_shared(ctx.logical_data(shape_of>(bs))); @@ -107,12 +107,12 @@ public: void fill(const std::function& f) { size_t bs = block_size; - for (int b = 0; b < nblocks; b++) + for (size_t b = 0; b < nblocks; b++) { ctx.task(exec_place::host, handles[b]->write())->*[&f, b, bs](cudaStream_t stream, auto ds) { cuda_safe_call(cudaStreamSynchronize(stream)); - for (int local_row = 0; local_row < ds.extent(0); local_row++) + for (size_t local_row = 0; local_row < ds.extent(0); local_row++) { ds(local_row) = f(local_row + b * bs); } @@ -234,7 +234,7 @@ class scalar DOT(vector& a, class vector& b) scalar global_res(true); // Loop over all blocks, - for (int bid = 0; bid < a.nblocks; bid++) + for (size_t bid = 0; bid < 
a.nblocks; bid++) { scalar res(true); @@ -267,7 +267,7 @@ void AXPY(const class scalar& alpha, class vector& x, class vector& y) assert(x.N == y.N); assert(x.nblocks == y.nblocks); - for (int b = 0; b < x.nblocks; b++) + for (size_t b = 0; b < x.nblocks; b++) { ctx.task(alpha.handle->read(), x.handles[b]->read(), y.handles[b]->rw()) ->* @@ -286,7 +286,7 @@ void SCALE_AXPY(const scalar& alpha, const class vector& x, class vector& y) assert(x.N == y.N); assert(x.nblocks == y.nblocks); - for (int b = 0; b < x.nblocks; b++) + for (size_t b = 0; b < x.nblocks; b++) { ctx.task(alpha.handle->read(), x.handles[b]->read(), y.handles[b]->rw()) ->*[](cudaStream_t stream, auto dalpha, auto dx, auto dy) { @@ -315,9 +315,9 @@ void GEMV(double alpha, class matrix& a, class vector& x, double beta, class vec size_t block_size = x.block_size; assert(block_size == y.block_size); - for (int row_y = 0; row_y < y.nblocks; row_y++) + for (size_t row_y = 0; row_y < y.nblocks; row_y++) { - for (int row_x = 0; row_x < x.nblocks; row_x++) + for (size_t row_x = 0; row_x < x.nblocks; row_x++) { double local_beta = (row_x == 0) ? beta : 1.0; diff --git a/cudax/examples/stf/linear_algebra/strassen.cu b/cudax/examples/stf/linear_algebra/strassen.cu index 0ceaa26a422..b28167cfcf8 100644 --- a/cudax/examples/stf/linear_algebra/strassen.cu +++ b/cudax/examples/stf/linear_algebra/strassen.cu @@ -417,9 +417,9 @@ void strassen_test(context& ctx, size_t N) cuda_safe_call(cudaHostRegister(B, N * N * sizeof(double), cudaHostRegisterPortable)); cuda_safe_call(cudaHostRegister(C, N * N * sizeof(double), cudaHostRegisterPortable)); - for (int col = 0; col < N; col++) + for (size_t col = 0; col < N; col++) { - for (int row = 0; row < N; row++) + for (size_t row = 0; row < N; row++) { A[row + N * col] = 1.0; B[row + N * col] = -1.0; From cdd94ad2159811cdbb98b10796fb5c1c57a3ab15 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Tue, 28 Jan 2025 00:24:08 -0800 Subject: [PATCH 2/3] A few improvements for internal macro documentation (#3554) * add MVSV 2017 deprecation * add ARM64 emulation note * Add behavior of NVRTC with OS macros * add FP8 identification macro --- docs/cccl_development/macro.rst | 113 ++++++++++++++++++-------------- docs/cpp.rst | 2 +- 2 files changed, 65 insertions(+), 50 deletions(-) diff --git a/docs/cccl_development/macro.rst b/docs/cccl_development/macro.rst index 30de6aa8b10..5cc39f0f0d3 100644 --- a/docs/cccl_development/macro.rst +++ b/docs/cccl_development/macro.rst @@ -12,21 +12,21 @@ Compiler Macros **Host compiler macros**: -+------------------------------+--------------------------------+ -| ``_CCCL_COMPILER(CLANG)`` | Clang | -+------------------------------+--------------------------------+ -| ``_CCCL_COMPILER(GCC)`` | GCC | -+------------------------------+--------------------------------+ -| ``_CCCL_COMPILER(NVHPC)`` | Nvidia HPC compiler | -+------------------------------+--------------------------------+ -| ``_CCCL_COMPILER(MSVC)`` | Microsoft Visual Studio | -+------------------------------+--------------------------------+ -| ``_CCCL_COMPILER(MSVC2017)`` | Microsoft Visual Studio 2017 | -+------------------------------+--------------------------------+ -| ``_CCCL_COMPILER(MSVC2019)`` | Microsoft Visual Studio 2019 | -+------------------------------+--------------------------------+ -| ``_CCCL_COMPILER(MSVC2022)`` | Microsoft Visual Studio 2022 | -+------------------------------+--------------------------------+ 
++------------------------------+---------------------------------------------+ +| ``_CCCL_COMPILER(CLANG)`` | Clang | ++------------------------------+---------------------------------------------+ +| ``_CCCL_COMPILER(GCC)`` | GCC | ++------------------------------+---------------------------------------------+ +| ``_CCCL_COMPILER(NVHPC)`` | Nvidia HPC compiler | ++------------------------------+---------------------------------------------+ +| ``_CCCL_COMPILER(MSVC)`` | Microsoft Visual Studio | ++------------------------------+---------------------------------------------+ +| ``_CCCL_COMPILER(MSVC2017)`` | Microsoft Visual Studio 2017 (deprecated) | ++------------------------------+---------------------------------------------+ +| ``_CCCL_COMPILER(MSVC2019)`` | Microsoft Visual Studio 2019 | ++------------------------------+---------------------------------------------+ +| ``_CCCL_COMPILER(MSVC2022)`` | Microsoft Visual Studio 2022 | ++------------------------------+---------------------------------------------+ The ``_CCCL_COMPILER`` function-like macro can also be used to check the version of a compiler. @@ -68,11 +68,11 @@ The ``_CCCL_CUDA_COMPILER`` function-like macro can also be used to check the ve **PTX macros**: -+-------------------------+-------------------------------------------------------------------------------------------------------------------+ -| ``_CCCL_PTX_ARCH`` | Alias of ``__CUDA_ARCH__`` with value equal to 0 if cuda compiler is not available | -+-------------------------+-------------------------------------------------------------------------------------------------------------------+ -| ``__cccl_ptx_isa`` | PTX ISA version available with the current CUDA compiler, e.g. PTX ISA 8.4 (``840``) is available from CUDA 12.4 | -+-------------------------+-------------------------------------------------------------------------------------------------------------------+ ++--------------------+-------------------------------------------------------------------------------------------------------------------+ +| ``_CCCL_PTX_ARCH`` | Alias of ``__CUDA_ARCH__`` with value equal to 0 if cuda compiler is not available | ++--------------------+-------------------------------------------------------------------------------------------------------------------+ +| ``__cccl_ptx_isa`` | PTX ISA version available with the current CUDA compiler, e.g. PTX ISA 8.4 (``840``) is available from CUDA 12.4 | ++--------------------+-------------------------------------------------------------------------------------------------------------------+ ---- @@ -81,26 +81,26 @@ Architecture Macros The following macros are used to check the target architecture. They comply with the compiler supported by the CUDA toolkit. Compilers outside the CUDA toolkit may define such macros in a different way. -+-------------------------+-------------------------------------+ -| ``_CCCL_ARCH(ARM64)`` | ARM 64-bit | -+-------------------------+-------------------------------------+ -| ``_CCCL_ARCH(X86_64)`` | X86 64-bit | -+-------------------------+-------------------------------------+ ++-------------------------+---------------------------------------------------+ +| ``_CCCL_ARCH(ARM64)`` | ARM 64-bit, including MSVC emulation | ++-------------------------+---------------------------------------------------+ +| ``_CCCL_ARCH(X86_64)`` | X86 64-bit. 
False on ARM 64-bit MSVC emulation | ++-------------------------+---------------------------------------------------+ ---- OS Macros --------- -+-----------------------+---------+ -| ``_CCCL_OS(WINDOWS)`` | Windows | -+-----------------------+---------+ -| ``_CCCL_OS(LINUX)`` | Linux | -+-----------------------+---------+ -| ``_CCCL_OS(ANDROID)`` | Android | -+-----------------------+---------+ -| ``_CCCL_OS(QNX)`` | QNX | -+-----------------------+---------+ ++-----------------------+---------------------------------+ +| ``_CCCL_OS(WINDOWS)`` | Windows, including NVRTC LLP64 | ++-----------------------+---------------------------------+ +| ``_CCCL_OS(LINUX)`` | Linux, including NVRTC LP64 | ++-----------------------+---------------------------------+ +| ``_CCCL_OS(ANDROID)`` | Android | ++-----------------------+---------------------------------+ +| ``_CCCL_OS(QNX)`` | QNX | ++-----------------------+---------------------------------+ ---- @@ -131,6 +131,8 @@ In addition, ``_CCCL_EXEC_CHECK_DISABLE`` disables the execution space check for Possible ``TARGET`` values: ++---------------------------+-------------------------------------------------------------------+ +| ``NV_ANY_TARGET`` | Any target | +---------------------------+-------------------------------------------------------------------+ | ``NV_IS_HOST`` | Host-code target | +---------------------------+-------------------------------------------------------------------+ @@ -159,10 +161,8 @@ Usage example: ---- -CUDA Extension Macros ---------------------- - -**CUDA attributes**: +CUDA attributes +--------------- +------------------------------+----------------------------------------------------------+ | ``_CCCL_GRID_CONSTANT`` | Grid constant kernel parameter | @@ -170,19 +170,32 @@ CUDA Extension Macros | ``_CCCL_GLOBAL_CONSTANT`` | Host/device global scope constant (``inline constexpr``) | +------------------------------+----------------------------------------------------------+ -**Extended floating-point types**: +---- -+------------------------------+-----------------------------------------------------------------------------------------------------------------+ -| ``_CCCL_HAS_NVFP16`` | `__half/__half2` data types are supported and enabled. Prefer over ``__CUDA_FP16_TYPES_EXIST__`` | -+------------------------------+-----------------------------------------------------------------------------------------------------------------+ -| ``_CCCL_HAS_NVBF16`` | `__nv_bfloat16/__nv_bfloat162` data types are supported and enabled. 
Prefer over ``__CUDA_BF16_TYPES_EXIST__`` | -+------------------------------+-----------------------------------------------------------------------------------------------------------------+ +Non-standard Types Support +-------------------------- -+------------------------------+----------------------------------------------------------------+ -| ``_LIBCUDACXX_HAS_NVFP16`` | `__half/__half2` host/device support (CUDA 12.2) | -+------------------------------+----------------------------------------------------------------+ -| ``_LIBCUDACXX_HAS_NVBF16`` | `__nv_bfloat16/__nv_bfloat162` host/device support (CUDA 12.2) | -+------------------------------+----------------------------------------------------------------+ ++------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ +| ``_CCCL_HAS_INT128()`` | ``__int128`` and ``__uint128_t`` for 128-bit integer are supported and enabled | ++------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ +| ``_CCCL_HAS_FLOAT128()`` | ``__float128`` for 128-bit floating-point are supported and enabled | ++------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ +| ``_CCCL_HAS_NVFP16`` | ``__half/__half2`` data types are supported and enabled. Prefer over ``__CUDA_FP16_TYPES_EXIST__`` | ++------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ +| ``_CCCL_HAS_NVBF16`` | ``__nv_bfloat16/__nv_bfloat162`` data types are supported and enabled. Prefer over ``__CUDA_BF16_TYPES_EXIST__`` | ++------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ +| ``_CCCL_HAS_FP8()`` | ``__nv_fp8_e5m2/__nv_fp8_e4m3/__nv_fp8_e8m0`` data types are supported and enabled. 
Prefer over ``__CUDA_FP8_TYPES_EXIST__`` | ++------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ + ++------------------------------+-------------------------------------------------------------------------+ +| ``_CCCL_DISABLE_INT128`` | Disable ``__int128/__uint128_t`` support | ++------------------------------+-------------------------------------------------------------------------+ +| ``_CCCL_DISABLE_FLOAT128`` | Disable ``__float128`` support | ++------------------------------+-------------------------------------------------------------------------+ +| ``_LIBCUDACXX_HAS_NVFP16`` | ``__half/__half2`` host/device are supported (CUDA 12.2+) | ++------------------------------+-------------------------------------------------------------------------+ +| ``_LIBCUDACXX_HAS_NVBF16`` | ``__nv_bfloat16/__nv_bfloat162`` host/device are supported (CUDA 12.2+) | ++------------------------------+-------------------------------------------------------------------------+ ---- @@ -244,6 +257,8 @@ Usage example: **Portable attributes**: ++----------------------------------+------------------------------------------------------------------------------+ +| ``_CCCL_ASSUME()`` | Portable ``[[assume]]`` attribute (before C++23) | +----------------------------------+------------------------------------------------------------------------------+ | ``_CCCL_FALLTHROUGH()`` | Portable ``[[fallthrough]]`` attribute (before C++17) | +----------------------------------+------------------------------------------------------------------------------+ diff --git a/docs/cpp.rst b/docs/cpp.rst index b0b92e520a5..74299a4697f 100644 --- a/docs/cpp.rst +++ b/docs/cpp.rst @@ -11,7 +11,7 @@ CUDA C++ Core Libraries CUB Thrust Cuda Experimental - CCCL development + CCCL Development Welcome to the CUDA Core Compute Libraries (CCCL) libraries for C++. 
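Note on the macro documentation above: the tables describe function-like checks that are consumed at the preprocessor level. The following is a hedged sketch of how downstream code might gate on the newly documented MSVC 2017 deprecation and on a compiler version; the comparison form _CCCL_COMPILER(NAME, op, version...) is assumed from the surrounding macro.rst prose rather than defined in these hunks, and the include is illustrative (any libcu++ header is expected to pull in the config machinery) — verify against the shipped headers before relying on it.

    // Sketch only: assumes the _CCCL_COMPILER machinery documented in
    // macro.rst is made visible by including any <cuda/std/...> header.
    #include <cuda/std/version>

    #if _CCCL_COMPILER(MSVC2017)
    // Deprecated per the table above; warn rather than fail the build.
    #  pragma message("MSVC 2017 support is deprecated in CCCL; use MSVC 2019 or newer")
    #endif

    // Assumed version-comparison form of the same function-like macro:
    #if _CCCL_COMPILER(GCC, <, 9)
    #  error "this translation unit requires GCC 9 or newer"
    #endif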
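For context on the index-type changes in the first patch: they resolve signed/unsigned mismatches in two opposite directions. Tile counts such as mt, nt, and nblocks are size_t members, so loop counters compared against them become size_t; cudaGetDeviceCount, by contrast, writes an int, so the device loops move the other way, from size_t to int. A minimal sketch of the pattern follows; the tiles struct and sweep function are stand-ins for the examples' matrix classes, not code from the patch.

    #include <cstddef>
    #include <cuda_runtime.h>

    struct tiles
    {
      std::size_t mt, nt; // tile counts are size_t, as in the STF examples
    };

    void sweep(const tiles& A)
    {
      // Counters match the size_t members, avoiding -Wsign-compare
      // (a hard error under -Werror):
      for (std::size_t colb = 0; colb < A.nt; colb++)
      {
        for (std::size_t rowb = colb; rowb < A.mt; rowb++)
        {
          // ... submit one task per tile ...
        }
      }

      // cudaGetDeviceCount reports an int, so the device loop uses int:
      int ndevs = 0;
      cudaGetDeviceCount(&ndevs);
      for (int d = 0; d < ndevs; d++)
      {
        cudaSetDevice(d);
      }
    }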
From c3948c438ae41627e0198d1f055c0aa86c628977 Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Tue, 28 Jan 2025 05:43:17 -0800 Subject: [PATCH 3/3] Replace pipes.quote with shlex.quote in lit config (#3547) * Replace pipes.quote with shlex.quote * Drop TBB run on windows to unblock CI * Update ci/matrix.yaml Co-authored-by: Michael Schellenberger Costa Co-authored-by: Bernhard Manfred Gruber --- ci/matrix.yaml | 5 +++-- libcudacxx/test/utils/libcudacxx/test/config.py | 11 +++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index d854303a253..6a98e8fc5b0 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -19,8 +19,9 @@ workflows: - {jobs: ['build'], std: 'max', cxx: ['msvc2019']} - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']} # Current CTK testing: - - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc']} - - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['clang', 'msvc']} + - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc', 'clang']} + # Disabled until we figure out the issue with the TBB dll + #- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc']} # Split up cub tests: - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc']} - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']} diff --git a/libcudacxx/test/utils/libcudacxx/test/config.py b/libcudacxx/test/utils/libcudacxx/test/config.py index c2b4871790e..af90b9fcbec 100644 --- a/libcudacxx/test/utils/libcudacxx/test/config.py +++ b/libcudacxx/test/utils/libcudacxx/test/config.py @@ -8,7 +8,6 @@ import ctypes import os -import pipes import platform import re import shlex @@ -1512,14 +1511,14 @@ def configure_modules(self): def configure_substitutions(self): sub = self.config.substitutions - cxx_path = pipes.quote(self.cxx.path) + cxx_path = shlex.quote(self.cxx.path) # Configure compiler substitutions sub.append(("%cxx", cxx_path)) sub.append(("%libcxx_src_root", self.libcudacxx_src_root)) # Configure flags substitutions - flags_str = " ".join([pipes.quote(f) for f in self.cxx.flags]) - compile_flags_str = " ".join([pipes.quote(f) for f in self.cxx.compile_flags]) - link_flags_str = " ".join([pipes.quote(f) for f in self.cxx.link_flags]) + flags_str = " ".join([shlex.quote(f) for f in self.cxx.flags]) + compile_flags_str = " ".join([shlex.quote(f) for f in self.cxx.compile_flags]) + link_flags_str = " ".join([shlex.quote(f) for f in self.cxx.link_flags]) all_flags = "%s %s %s" % (flags_str, compile_flags_str, link_flags_str) sub.append(("%flags", flags_str)) sub.append(("%compile_flags", compile_flags_str)) @@ -1548,7 +1547,7 @@ def configure_substitutions(self): sub.append(("%run", "%t.exe")) # Configure not program substitutions not_py = os.path.join(self.libcudacxx_src_root, "test", "utils", "not.py") - not_str = "%s %s " % (pipes.quote(sys.executable), pipes.quote(not_py)) + not_str = "%s %s " % (shlex.quote(sys.executable), shlex.quote(not_py)) sub.append(("not ", not_str)) if self.get_lit_conf("libcudacxx_gdb"): sub.append(("%libcxx_gdb", self.get_lit_conf("libcudacxx_gdb")))