From a2972c08ab4bc53ea70ec78c53d784f166ce946f Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Wed, 30 Oct 2024 16:17:08 +0200
Subject: [PATCH 01/25] initial changes and support for 1d generic kernel

---
 src/CMakeLists.txt                          |   2 +
 src/include/miopen/tensor/invoke_params.hpp |  97 +++++++++
 .../miopen/tensor/problem_description.hpp   | 152 ++++++++++++++
 src/include/miopen/tensor/solvers.hpp       |  62 ++++++
 src/include/miopen/tensor_ops.hpp           |  16 ++
 src/solver/tensor/TensorOp1dGeneric.cpp     | 194 ++++++++++++++++++
 src/tensor.cpp                              |  53 +++++
 src/tensor/problem_description.cpp          |  67 ++++++
 test/tensor_ops.cpp                         |  42 ++--
 9 files changed, 664 insertions(+), 21 deletions(-)
 create mode 100644 src/include/miopen/tensor/invoke_params.hpp
 create mode 100644 src/include/miopen/tensor/problem_description.hpp
 create mode 100644 src/include/miopen/tensor/solvers.hpp
 create mode 100644 src/solver/tensor/TensorOp1dGeneric.cpp
 create mode 100644 src/tensor/problem_description.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 92e4f4264a..4a92727cfd 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -337,11 +337,13 @@ set( MIOpen_Source
     solver/softmarginloss/forward_softmarginloss.cpp
     solver/softmax/attn_softmax.cpp
     solver/softmax/softmax.cpp
+    solver/tensor/TensorOp1dGeneric.cpp
     subbuffers.cpp
     t5layernorm_api.cpp
     target_properties.cpp
     temp_file.cpp
     tensor.cpp
+    tensor/problem_description.cpp
     tensor_api.cpp
     transformers_adam_w_api.cpp
     seq_tensor.cpp
diff --git a/src/include/miopen/tensor/invoke_params.hpp b/src/include/miopen/tensor/invoke_params.hpp
new file mode 100644
index 0000000000..68c96fda3f
--- /dev/null
+++ b/src/include/miopen/tensor/invoke_params.hpp
@@ -0,0 +1,97 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <miopen/invoke_params.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+namespace tensor {
+
+struct InvokeParams : public miopen::InvokeParams
+{
+    InvokeParams(miopenTensorOp_t tensorOp_,
+                 const void* alpha0_,
+                 const TensorDescriptor& aTensorDesc_,
+                 ConstData_t ATensor_,
+                 const void* alpha1_,
+                 const TensorDescriptor& bTensorDesc_,
+                 ConstData_t BTensor_,
+                 const void* beta_,
+                 const TensorDescriptor& cTensorDesc_,
+                 Data_t CTensor_,
+                 const size_t Aoffset_,
+                 const size_t Boffset_,
+                 const size_t Coffset_,
+                 const bool nonStandardSquash_)
+        : alpha0(alpha0_),
+          alpha1(alpha1_),
+          beta(beta_),
+          tensorOp(tensorOp_),
+          aTensorDesc(aTensorDesc_),
+          ATensor(ATensor_),
+          bTensorDesc(bTensorDesc_),
+          BTensor(BTensor_),
+          cTensorDesc(cTensorDesc_),
+          CTensor(CTensor_),
+          Aoffset(Aoffset_),
+          Boffset(Boffset_),
+          Coffset(Coffset_),
+          nonStandardSquash(nonStandardSquash_)
+    {
+    }
+
+    size_t GetWorkspaceSize() const { return 0; }
+    Data_t GetWorkspace() const { return nullptr; }
+
+public:
+    const void* alpha0;
+    const void* alpha1;
+    const void* beta;
+
+    miopenTensorOp_t tensorOp;
+
+    TensorDescriptor aTensorDesc;
+    ConstData_t ATensor;
+
+    TensorDescriptor bTensorDesc;
+    ConstData_t BTensor;
+
+    TensorDescriptor cTensorDesc;
+    Data_t CTensor;
+
+    size_t Aoffset;
+    size_t Boffset;
+    size_t Coffset;
+
+    bool nonStandardSquash;
+};
+
+} // namespace tensor
+
+} // namespace miopen
diff --git a/src/include/miopen/tensor/problem_description.hpp b/src/include/miopen/tensor/problem_description.hpp
new file mode 100644
index 0000000000..41a9e4d848
--- /dev/null
+++ b/src/include/miopen/tensor/problem_description.hpp
@@ -0,0 +1,152 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <miopen/problem_description_base.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+struct NetworkConfig;
+
+namespace tensor {
+
+struct ProblemDescription : ProblemDescriptionBase
+{
+    ProblemDescription(const miopenTensorOp_t tensorOp_,
+                       const void* alpha0_,
+                       const void* alpha1_,
+                       const void* beta_,
+                       const TensorDescriptor& aTensorDesc_,
+                       const TensorDescriptor& bTensorDesc_,
+                       const TensorDescriptor& cTensorDesc_,
+                       const bool nonStandardSquash_)
+        : tensorOp(tensorOp_),
+          aTensorDesc(aTensorDesc_),
+          bTensorDesc(bTensorDesc_),
+          cTensorDesc(cTensorDesc_),
+          nonStandardSquash(nonStandardSquash_)
+    {
+        CheckAndAssignAlphaBeta(alpha0_, alpha1_, beta_);
+
+        if(aTensorDesc.GetElementSize() != cTensorDesc.GetElementSize())
+        {
+            MIOPEN_THROW("A and C Tensors do not match");
+        }
+
+        if(bTensorDesc.GetType() != cTensorDesc.GetType())
+        {
+            MIOPEN_THROW("Datatypes for B and C tensors do not match!");
+        }
+
+        auto blens = bTensorDesc.GetLengths();
+        auto clens = cTensorDesc.GetLengths();
+        if(clens.size() > 5)
+        {
+            MIOPEN_THROW("Tensor dimension larger than 5: " + std::to_string(clens.size()));
+        }
+
+        if(blens.size() != clens.size())
+        {
+            MIOPEN_THROW("Number of dims in B and C Tensors do not match: " +
+                         std::to_string(blens.size()) + ", " + std::to_string(clens.size()));
+        }
+
+        if(!nonStandardSquash)
+        {
+            for(std::size_t i = 0; i < clens.size(); i++)
+            {
+                if(blens[i] != 1 && blens[i] != clens[i])
+                {
+                    MIOPEN_THROW("BTensor dim != 1 && BTensor dim != CTensor dim: " +
+                                 std::to_string(i));
+                }
+            }
+        }
+        else
+        {
+            // non-standard behavior because blens[1] may not be equal to clens[1]
+            if(!(clens.size() == 3 && blens[0] == 1 && clens[0] == 1 && blens[2] == clens[2]))
+            {
+                MIOPEN_THROW(
+                    "Non standard squashed operation supported only for 3d tensors and for "
+                    "the specific configuration");
+            }
+        }
+    }
+
+    const miopenTensorOp_t GetTensorOp() const { return tensorOp; }
+
+    const void* GetAlpha0() const { return alpha0; }
+    const void* GetAlpha1() const { return alpha1; }
+    const void* GetBeta() const { return beta; }
+
+    const TensorDescriptor& GetATensorDesc() const { return aTensorDesc; }
+    const TensorDescriptor& GetBTensorDesc() const { return bTensorDesc; }
+    const TensorDescriptor& GetCTensorDesc() const { return cTensorDesc; }
+
+    const bool GetNonStandardSquash() const { return nonStandardSquash; }
+
+    NetworkConfig MakeNetworkConfig() const override;
+
+private:
+    void CheckAndAssignAlphaBeta(const void* alpha0_, const void* alpha1_, const void* beta_)
+    {
+        if(alpha0_ == nullptr)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Alpha0 value is nullptr");
+        }
+        if(alpha1_ == nullptr)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Alpha1 value is nullptr");
+        }
+        if(beta_ == nullptr)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Beta value is nullptr");
+        }
+
+        alpha0 = alpha0_;
+        alpha1 = alpha1_;
+        beta   = beta_;
+    }
+
+    const miopenTensorOp_t tensorOp;
+
+    const void* alpha0;
+    const void* alpha1;
+    const void* beta;
+
+    const TensorDescriptor& aTensorDesc;
+    const TensorDescriptor& bTensorDesc;
+    const TensorDescriptor& cTensorDesc;
+
+    const bool nonStandardSquash;
+};
+
+} // namespace tensor
+
+} // namespace miopen
diff --git a/src/include/miopen/tensor/solvers.hpp b/src/include/miopen/tensor/solvers.hpp
new file mode 100644
index 0000000000..072f708c84
--- /dev/null
+++ b/src/include/miopen/tensor/solvers.hpp
@@ -0,0 +1,62 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <miopen/solver.hpp>
+#include <miopen/tensor/problem_description.hpp>
+
+#include <utility>
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensor {
+
+using TensorOpSolver = NonTunableSolverBase<ExecutionContext, miopen::tensor::ProblemDescription>;
+
+struct Op1dTensorGeneric final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op1dTensorGeneric>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensor::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensor::ProblemDescription& problem) const override;
+
+    std::size_t GetWorkspaceSize(const ExecutionContext& context,
+                                 const miopen::tensor::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
+} // namespace tensor
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/include/miopen/tensor_ops.hpp b/src/include/miopen/tensor_ops.hpp
index 25d838598b..fa5be24048 100644
--- a/src/include/miopen/tensor_ops.hpp
+++ b/src/include/miopen/tensor_ops.hpp
@@ -189,6 +189,22 @@ MIOPEN_INTERNALS_EXPORT void OpTensor(const Handle& handle,
                                       size_t Coffset = 0,
                                       bool nonStandardSquash = false);
 
+MIOPEN_INTERNALS_EXPORT void OpTensorNew(Handle& handle,
+                                         miopenTensorOp_t tensorOp,
+                                         const void* alpha0,
+                                         const TensorDescriptor& aTensorDesc,
+                                         ConstData_t ATensor,
+                                         const void* alpha1,
+                                         const TensorDescriptor& bTensorDesc,
+                                         ConstData_t BTensor,
+                                         const void* beta,
+                                         const TensorDescriptor& cTensorDesc,
+                                         Data_t CTensor,
+                                         size_t Aoffset = 0,
+                                         size_t Boffset = 0,
+                                         size_t Coffset = 0,
+                                         bool nonStandardSquash = false);
+
 MIOPEN_INTERNALS_EXPORT void CopyTensor(const Handle& handle,
                                         const TensorDescriptor& srcDesc,
                                         ConstData_t src,
diff --git a/src/solver/tensor/TensorOp1dGeneric.cpp b/src/solver/tensor/TensorOp1dGeneric.cpp
new file mode 100644
index 0000000000..3eebb7b950
--- /dev/null
+++ b/src/solver/tensor/TensorOp1dGeneric.cpp
@@ -0,0 +1,194 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/tensor/solvers.hpp>
+
+#include <miopen/tensor/invoke_params.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/float_equal.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/visit_float.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensor {
+
+bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context,
+                                     const miopen::tensor::ProblemDescription& problem) const
+{
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto alens       = aTensorDesc.GetLengths();
+    auto blens       = bTensorDesc.GetLengths();
+    auto asize       = alens.size();
+
+    if(asize == 1)
+    {
+        return true;
+    }
+    if(asize == 2 && ((blens[0] == 1 && blens[1] == 1) || (blens[0] > 1 && blens[1] > 1)))
+    {
+        return true;
+    }
+    if(asize == 3 && ((blens[0] == 1 && blens[1] == 1 && blens[2] == 1) ||
+                      (blens[0] > 1 && blens[1] > 1 && blens[2] > 1)))
+    {
+        return true;
+    }
+    return false;
+}
+
+std::size_t
+Op1dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context,
+                                    const miopen::tensor::ProblemDescription& problem) const
+{
+    return 0;
+}
+
+ConvSolution Op1dTensorGeneric::GetSolution(const ExecutionContext& context,
+                                            const miopen::tensor::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto cTensorDesc = problem.GetCTensorDesc();
+
+    auto clens = cTensorDesc.GetLengths();
+
+    size_t local_threads = 256;
+    size_t max_num_wg    = 4096;
+
+    auto num_wg = std::clamp(clens[0] / local_threads, size_t(1), size_t(max_num_wg));
+    num_wg      = num_wg > max_num_wg ? max_num_wg : num_wg;
+    size_t global_threads = num_wg * local_threads;
+
+    const std::vector<size_t> vld{local_threads, 1, 1};
+    const std::vector<size_t> vgd{global_threads, 1, 1};
+
+    KernelBuildParameters build_params =
+        KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}};
+
+    // build_params.Define("MIOPEN_TENSOR_OP", std::to_string(problem.GetTensorOp()));
+
+    switch(problem.GetTensorOp())
+    {
+    case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break;
+    case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break;
+    case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break;
+    case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break;
+    }
+
+    if(aTensorDesc.AllDimsFitIntoInt())
+    {
+        build_params.Define("DIM_TYPE", "uint32_t");
+    }
+    else
+    {
+        build_params.Define("DIM_TYPE", "uint64_t");
+    }
+
+    build_params.Define("USE_1D_TENSOR_GENERIC");
+
+    auto kernel = KernelInfo{};
+
+    kernel.comp_options = build_params.GenerateFor(
+        kbp::HIP{}); // GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{});
+    kernel.kernel_file = "MIOpenTensorKernelsHip.cpp";
+    kernel.kernel_name = "Op1dTensorGeneric";
+
+    for(uint32_t i = 0; i <= 2; i++)
+    {
+        kernel.l_wk.push_back(vld[i]);
+        kernel.g_wk.push_back(vgd[i]);
+    }
+
+    result.invoker_factory = [=](const std::vector<Kernel> kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::tensor::InvokeParams>();
+
+            visit_float(bTensorDesc.GetType(), [&](auto as_float) {
+                auto miopen_alpha0 = as_float(*(static_cast<const float*>(params.alpha0)));
+                auto miopen_alpha1 = as_float(*(static_cast<const float*>(params.alpha1)));
+                auto miopen_beta   = as_float(*(static_cast<const float*>(params.beta)));
+
+                auto blens = params.bTensorDesc.GetLengths();
+                auto clens = params.cTensorDesc.GetLengths();
+
+                auto astrides = params.aTensorDesc.GetStrides();
+                auto bstrides = params.bTensorDesc.GetStrides();
+                auto cstrides = params.cTensorDesc.GetStrides();
+
+                if(aTensorDesc.AllDimsFitIntoInt())
+                { // change offsets to 64bit after PR is merged
+                    kernel(params.ATensor,
+                           params.BTensor,
+                           params.CTensor,
+                           static_cast<uint32_t>(params.Aoffset),
+                           static_cast<uint32_t>(params.Boffset),
+                           static_cast<uint32_t>(params.Coffset),
+                           static_cast<uint32_t>(astrides[0]),
+                           static_cast<uint32_t>(blens[0] == 1 ? 0 : bstrides[0]),
+                           static_cast<uint32_t>(cstrides[0]),
+                           miopen_alpha0,
+                           miopen_alpha1,
+                           miopen_beta,
+                           static_cast<uint32_t>(clens[0]),
+                           !float_equal(miopen_beta, 0.0));
+                }
+                else
+                {
+                    kernel(params.ATensor,
+                           params.BTensor,
+                           params.CTensor,
+                           static_cast<uint64_t>(params.Aoffset),
+                           static_cast<uint64_t>(params.Boffset),
+                           static_cast<uint64_t>(params.Coffset),
+                           static_cast<uint64_t>(astrides[0]),
+                           static_cast<uint64_t>(blens[0] == 1 ? 0 : bstrides[0]),
+                           static_cast<uint64_t>(cstrides[0]),
+                           miopen_alpha0,
+                           miopen_alpha1,
+                           miopen_beta,
+                           static_cast<uint64_t>(clens[0]),
+                           !float_equal(miopen_beta, 0.0));
+                }
+            });
+        };
+    };
+    result.construction_params.push_back(kernel);
+
+    return result;
+}
+
+} // namespace tensor
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/tensor.cpp b/src/tensor.cpp
index 3e5190bc25..89400c9166 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -28,6 +28,10 @@
 #include
 #include
 #include
+#include <miopen/tensor/invoke_params.hpp>
+#include <miopen/tensor/problem_description.hpp>
+#include <miopen/tensor/solvers.hpp>
+#include <miopen/find_solution.hpp>
 
 #include
 
@@ -868,6 +872,55 @@ void from_json(const nlohmann::json& j, TensorDescriptor& descriptor)
     j.at("type").get_to(descriptor.type);
 }
 
+void OpTensorNew(Handle& handle,
+                 miopenTensorOp_t tensorOp,
+                 const void* alpha0,
+                 const TensorDescriptor& aTensorDesc,
+                 ConstData_t ATensor,
+                 const void* alpha1,
+                 const TensorDescriptor& bTensorDesc,
+                 ConstData_t BTensor,
+                 const void* beta,
+                 const TensorDescriptor& cTensorDesc,
+                 Data_t CTensor,
+                 const size_t Aoffset,
+                 const size_t Boffset,
+                 const size_t Coffset,
+                 bool nonStandardSquash)
+{
+    if(ATensor == nullptr || BTensor == nullptr || CTensor == nullptr)
+    {
+        MIOPEN_THROW(miopenStatusBadParm);
+    }
+
+    const auto problem = tensor::ProblemDescription{
+        tensorOp, alpha0, alpha1, beta, aTensorDesc, bTensorDesc, cTensorDesc, nonStandardSquash};
+
+    const auto invoke_params = tensor::InvokeParams{tensorOp,
+                                                    alpha0,
+                                                    aTensorDesc,
+                                                    ATensor,
+                                                    alpha1,
+                                                    bTensorDesc,
+                                                    BTensor,
+                                                    beta,
+                                                    cTensorDesc,
+                                                    CTensor,
+                                                    Aoffset,
+                                                    Boffset,
+                                                    Coffset,
+                                                    nonStandardSquash};
+
+    const auto tensor_dim = aTensorDesc.GetLengths().size();
+
+    if(tensor_dim == 1)
+    {
+        const auto algo    = AlgorithmName{"Op1dTensorGeneric"};
+        const auto solvers = solver::SolverContainer<solver::tensor::Op1dTensorGeneric>{};
+        solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
+    }
+}
+
 } // namespace miopen
 
 int miopenGetTensorIndex(miopenTensorDescriptor_t tensorDesc, std::initializer_list<int> indices)
diff --git a/src/tensor/problem_description.cpp b/src/tensor/problem_description.cpp
new file mode 100644
index 0000000000..562efd6389
--- /dev/null
+++ b/src/tensor/problem_description.cpp
@@ -0,0 +1,67 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/tensor/problem_description.hpp>
+#include <miopen/names.hpp>
+
+namespace miopen {
+
+namespace tensor {
+
+NetworkConfig ProblemDescription::MakeNetworkConfig() const
+{
+    std::ostringstream ss;
+    const auto tensor_dim = aTensorDesc.GetLengths().size();
+
+    ss << std::to_string(bTensorDesc.GetType()) << "-" << std::to_string(aTensorDesc.GetType())
+       << "-" << std::to_string(tensorOp);
+
+    if(tensor_dim == 1)
+    {
+        size_t local_threads = 256;
+        int max_num_wg       = 4096;
+        int num_wg =
+            std::clamp(cTensorDesc.GetLengths()[0] / local_threads, size_t(1), size_t(max_num_wg));
+        num_wg                = num_wg > max_num_wg ? max_num_wg : num_wg;
+        size_t global_threads = num_wg * local_threads;
+        ss << "-" << std::to_string(global_threads) << "-" << std::to_string(local_threads);
+
+        if(aTensorDesc.AllDimsFitIntoInt())
+        {
+            ss << "-32bit";
+        }
+        else
+        {
+            ss << "-64bit";
+        }
+    }
+
+    return NetworkConfig{ss.str()};
+}
+
+} // namespace tensor
+
+} // namespace miopen
diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp
index 3121715e8a..152443e4fc 100644
--- a/test/tensor_ops.cpp
+++ b/test/tensor_ops.cpp
@@ -181,24 +181,24 @@ struct verify_tensor_ops
         auto a_dev = handle.Write(a.data);
         auto b_dev = handle.Write(b.data);
 
-        miopen::OpTensor(handle,
-                         // miopenTensorOpAdd,
-                         // miopenTensorOpMax,
-                         // miopenTensorOpMin,
-                         miopenTensorOpMul,
-                         &alpha0,
-                         a.desc,
-                         a_dev.get(),
-                         &alpha1,
-                         b.desc,
-                         b_dev.get(),
-                         &beta,
-                         c.desc,
-                         c_dev.get(),
-                         Aoffset,
-                         Boffset,
-                         Coffset,
-                         false); // it does not verify non-standard behaviour
+        miopen::OpTensorNew(handle,
+                            // miopenTensorOpAdd,
+                            // miopenTensorOpMax,
+                            // miopenTensorOpMin,
+                            miopenTensorOpMul,
+                            &alpha0,
+                            a.desc,
+                            a_dev.get(),
+                            &alpha1,
+                            b.desc,
+                            b_dev.get(),
+                            &beta,
+                            c.desc,
+                            c_dev.get(),
+                            Aoffset,
+                            Boffset,
+                            Coffset,
+                            false); // it does not verify non-standard behaviour
 
         if(not no_validate)
         {
@@ -241,12 +241,12 @@ struct tensor_ops_driver : test_driver
 
     std::vector<std::vector<int>> get_sub_tensor_a()
     {
-        return {{32, 16, 8, 4, 4}, {16, 20, 16, 8}, {20, 16, 8}, {1, 16, 8}, {16, 8}, {8}};
+        return {/*{32, 16, 8, 4, 4}, {16, 20, 16, 8}, {20, 16, 8}, {1, 16, 8}, {16, 8},*/ {8}};
     }
 
     std::vector<std::vector<int>> get_sub_tensor_b()
    {
-        return {{32, 16, 8, 4, 4},
+        return {/*{32, 16, 8, 4, 4},
                 {32, 16, 1, 1, 1},
                 {1, 16, 8, 1, 1},
                 {1, 1, 8, 4, 1},
                 {20, 16, 8},
                 {20, 16, 1},
                 {1, 16, 8},
                 {1, 16, 1},
                 {20, 1, 1},
                 {16, 8},
                 {16, 1},
-                {1, 8},
+                {1, 8},*/
                 {8},
                 {1}};
     }

From 75cecb268320a86ab2298e101d4c41c0cd68d788 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Wed, 30 Oct 2024 16:19:09 +0200
Subject: [PATCH 02/25] 1d solver file name change

---
 .../tensor/{TensorOp1dGeneric.cpp => Op1dTensorGeneric.cpp}       | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/solver/tensor/{TensorOp1dGeneric.cpp => Op1dTensorGeneric.cpp} (100%)

diff --git a/src/solver/tensor/TensorOp1dGeneric.cpp b/src/solver/tensor/Op1dTensorGeneric.cpp
similarity index 100%
rename from src/solver/tensor/TensorOp1dGeneric.cpp
rename to src/solver/tensor/Op1dTensorGeneric.cpp

From 035989c0a2dd83a82a9e0cada7454b70b66ad680 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Wed, 30 Oct 2024 16:20:29 +0200
Subject: [PATCH 03/25] solver name change in cmakelists.txt

---
 src/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4a92727cfd..6726430535 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -337,7 +337,7 @@ set( MIOpen_Source
     solver/softmarginloss/forward_softmarginloss.cpp
     solver/softmax/attn_softmax.cpp
     solver/softmax/softmax.cpp
-    solver/tensor/TensorOp1dGeneric.cpp
+    solver/tensor/Op1dTensorGeneric.cpp
     subbuffers.cpp
     t5layernorm_api.cpp
     target_properties.cpp

From cf91070de6b0d58b310e34f38cddf4145d2f50b2 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Thu, 31 Oct 2024 15:53:30 +0200
Subject: [PATCH 04/25] more changes, 2d generic and 2d lite kernel

---
 src/CMakeLists.txt                      |   2 +
 src/include/miopen/tensor/solvers.hpp   |  32 ++++
 src/solver/tensor/Op1dTensorGeneric.cpp |  19 ++-
 src/solver/tensor/Op2dTensorGeneric.cpp | 173 +++++++++++++++++++
 src/solver/tensor/Op2dTensorLite.cpp    | 217 ++++++++++++++++++++++++
 src/tensor.cpp                          |  13 +-
 src/tensor/problem_description.cpp      |  44 ++++-
 test/tensor_ops.cpp                     |   6 +-
 8 files changed, 482 insertions(+), 24 deletions(-)
 create mode 100644 src/solver/tensor/Op2dTensorGeneric.cpp
 create mode 100644 src/solver/tensor/Op2dTensorLite.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6726430535..9dc938cae4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -338,6 +338,8 @@ set( MIOpen_Source
     solver/softmax/attn_softmax.cpp
     solver/softmax/softmax.cpp
     solver/tensor/Op1dTensorGeneric.cpp
+    solver/tensor/Op2dTensorGeneric.cpp
+    solver/tensor/Op2dTensorLite.cpp
     subbuffers.cpp
     t5layernorm_api.cpp
     target_properties.cpp
diff --git a/src/include/miopen/tensor/solvers.hpp b/src/include/miopen/tensor/solvers.hpp
index 072f708c84..d87d54d674 100644
--- a/src/include/miopen/tensor/solvers.hpp
+++ b/src/include/miopen/tensor/solvers.hpp
@@ -55,6 +55,38 @@ struct Op1dTensorGeneric final : TensorOpSolver
     bool MayNeedWorkspace() const override { return false; }
 };
 
+struct Op2dTensorGeneric final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op2dTensorGeneric>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensor::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensor::ProblemDescription& problem) const override;
+
+    std::size_t GetWorkspaceSize(const ExecutionContext& context,
+                                 const miopen::tensor::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
+struct Op2dTensorLite final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op2dTensorLite>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensor::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensor::ProblemDescription& problem) const override;
+
+    std::size_t GetWorkspaceSize(const ExecutionContext& context,
+                                 const miopen::tensor::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
 } // namespace tensor
 
 } // namespace solver
diff --git a/src/solver/tensor/Op1dTensorGeneric.cpp b/src/solver/tensor/Op1dTensorGeneric.cpp
index 3eebb7b950..e550ec7952 100644
--- a/src/solver/tensor/Op1dTensorGeneric.cpp
+++ b/src/solver/tensor/Op1dTensorGeneric.cpp
@@ -51,15 +51,16 @@ bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context,
     {
         return true;
     }
-    if(asize == 2 && ((blens[0] == 1 && blens[1] == 1) || (blens[0] > 1 && blens[1] > 1)))
-    {
-        return true;
-    }
-    if(asize == 3 && ((blens[0] == 1 && blens[1] == 1 && blens[2] == 1) ||
-                      (blens[0] > 1 && blens[1] > 1 && blens[2] > 1)))
-    {
-        return true;
-    }
+    // add support for this later
+    // if(asize == 2 && ((blens[0] == 1 && blens[1] == 1) || (blens[0] > 1 && blens[1] > 1)))
+    // {
+    //     return true;
+    // }
+    // if(asize == 3 && ((blens[0] == 1 && blens[1] == 1 && blens[2] == 1) ||
+    //                   (blens[0] > 1 && blens[1] > 1 && blens[2] > 1)))
+    // {
+    //     return true;
+    // }
     return false;
 }
 
diff --git a/src/solver/tensor/Op2dTensorGeneric.cpp b/src/solver/tensor/Op2dTensorGeneric.cpp
new file mode 100644
index 0000000000..6155117ba9
--- /dev/null
+++ b/src/solver/tensor/Op2dTensorGeneric.cpp
@@ -0,0 +1,173 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/tensor/solvers.hpp>
+
+#include <miopen/tensor/invoke_params.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/float_equal.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/visit_float.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensor {
+
+bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context,
+                                     const miopen::tensor::ProblemDescription& problem) const
+{
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto alens       = aTensorDesc.GetLengths();
+    auto blens       = bTensorDesc.GetLengths();
+    auto asize       = alens.size();
+
+    if(asize == 2)
+    {
+        return true;
+    }
+    // add applicable when asize == 3 and some special cases for b dimensions
+
+    return false;
+}
+
+std::size_t
+Op2dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context,
+                                    const miopen::tensor::ProblemDescription& problem) const
+{
+    return 0;
+}
+
+ConvSolution Op2dTensorGeneric::GetSolution(const ExecutionContext& context,
+                                            const miopen::tensor::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto cTensorDesc = problem.GetCTensorDesc();
+
+    auto clens = cTensorDesc.GetLengths();
+
+    size_t local_threads = 32;
+    size_t max_num_wg    = 4096;
+
+    auto num_wg = std::clamp((clens[0] * clens[1]) / local_threads, size_t(1), size_t(max_num_wg));
+    num_wg      = num_wg > max_num_wg ? max_num_wg : num_wg;
+    size_t global_threads = num_wg * local_threads;
+
+    const std::vector<size_t> vld{local_threads, 1, 1};
+    const std::vector<size_t> vgd{global_threads, 1, 1};
+
+    KernelBuildParameters build_params =
+        KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}};
+
+    // build_params.Define("MIOPEN_TENSOR_OP", std::to_string(problem.GetTensorOp()));
+
+    switch(problem.GetTensorOp())
+    {
+    case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break;
+    case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break;
+    case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break;
+    case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break;
+    }
+
+    // support for 64bit still not merged
+    // if(aTensorDesc.AllDimsFitIntoInt())
+    // {
+    //     build_params.Define("DIM_TYPE", "uint32_t");
+    // }
+    // else
+    // {
+    //     build_params.Define("DIM_TYPE", "uint64_t");
+    // }
+
+    build_params.Define("USE_2D_TENSOR_GENERIC");
+
+    auto kernel = KernelInfo{};
+
+    kernel.comp_options = build_params.GenerateFor(
+        kbp::HIP{}); // GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{});
+    kernel.kernel_file = "MIOpenTensorKernelsHip.cpp";
+    kernel.kernel_name = "Op2dTensorGeneric";
+
+    for(uint32_t i = 0; i <= 2; i++)
+    {
+        kernel.l_wk.push_back(vld[i]);
+        kernel.g_wk.push_back(vgd[i]);
+    }
+
+    result.invoker_factory = [=](const std::vector<Kernel> kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::tensor::InvokeParams>();
+
+            visit_float(bTensorDesc.GetType(), [&](auto as_float) {
+                auto miopen_alpha0 = as_float(*(static_cast<const float*>(params.alpha0)));
+                auto miopen_alpha1 = as_float(*(static_cast<const float*>(params.alpha1)));
+                auto miopen_beta   = as_float(*(static_cast<const float*>(params.beta)));
+
+                auto blens = params.bTensorDesc.GetLengths();
+                auto clens = params.cTensorDesc.GetLengths();
+
+                auto astrides = params.aTensorDesc.GetStrides();
+                auto bstrides = params.bTensorDesc.GetStrides();
+                auto cstrides = params.cTensorDesc.GetStrides();
+
+                kernel(params.ATensor,
+                       params.BTensor,
+                       params.CTensor,
+                       static_cast<uint32_t>(params.Aoffset),
+                       static_cast<uint32_t>(params.Boffset),
+                       static_cast<uint32_t>(params.Coffset),
+                       static_cast<uint32_t>(blens[1] == 1 ? clens[1] : blens[1]),
+                       static_cast<uint32_t>(clens[1]),
+                       static_cast<uint32_t>(astrides[0]),
+                       static_cast<uint32_t>(astrides[1]),
+                       static_cast<uint32_t>(blens[0] == 1 ? 0 : bstrides[0]),
+                       static_cast<uint32_t>(blens[1] == 1 ? 0 : bstrides[1]),
+                       static_cast<uint32_t>(cstrides[0]),
+                       static_cast<uint32_t>(cstrides[1]),
+                       miopen_alpha0,
+                       miopen_alpha1,
+                       miopen_beta,
+                       static_cast<uint32_t>(clens[0]),
+                       !float_equal(miopen_beta, 0.0));
+            });
+        };
+    };
+    result.construction_params.push_back(kernel);
+
+    return result;
+}
+
+} // namespace tensor
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/tensor/Op2dTensorLite.cpp b/src/solver/tensor/Op2dTensorLite.cpp
new file mode 100644
index 0000000000..712a32d49d
--- /dev/null
+++ b/src/solver/tensor/Op2dTensorLite.cpp
@@ -0,0 +1,217 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/tensor/solvers.hpp>
+
+#include <miopen/tensor/invoke_params.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/float_equal.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/visit_float.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensor {
+
+bool Op2dTensorLite::IsApplicable(const ExecutionContext& context,
+                                  const miopen::tensor::ProblemDescription& problem) const
+{
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto cTensorDesc = problem.GetCTensorDesc();
+
+    auto alens = aTensorDesc.GetLengths();
+    auto blens = bTensorDesc.GetLengths();
+    auto clens = cTensorDesc.GetLengths();
+
+    auto asize = alens.size();
+
+    size_t local_threads = 256;
+    int max_num_wg       = 4096;
+
+    // for naive tensor ops
+    size_t RD_BLCK     = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1;
+    size_t total_work  = std::max(clens[2] / RD_BLCK, size_t(1));
+    size_t grp_sz      = (total_work + local_threads - 1) / local_threads;
+
+    // opencl kernels are no longer supported, fallback to generic case
+    bool lite_applicable = grp_sz <= size_t(max_num_wg);
+
+    bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 &&
+                   (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2];
+
+    if(asize == 3 && lite_applicable && is_lite)
+    {
+        return true;
+    }
+
+    return false;
+}
+
+std::size_t
+Op2dTensorLite::GetWorkspaceSize(const ExecutionContext& context,
+                                 const miopen::tensor::ProblemDescription& problem) const
+{
+    return 0;
+}
+
+ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context,
+                                         const miopen::tensor::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto cTensorDesc = problem.GetCTensorDesc();
+
+    auto alens = aTensorDesc.GetLengths();
+    auto blens = bTensorDesc.GetLengths();
+    auto clens = cTensorDesc.GetLengths();
+
+    auto astrides = aTensorDesc.GetStrides();
+    auto bstrides = bTensorDesc.GetStrides();
+    auto cstrides = cTensorDesc.GetStrides();
+
+    // first_not_one is incorrect if BTensor size is equal to 1
+    auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; });
+    auto d             = std::distance(blens.begin(), first_not_one.base());
+
+    // quick fix
+    int num_wg = first_not_one != blens.rend()
+                     ? static_cast<int>(*first_not_one == 0 ? 1 : *first_not_one)
+                     : 1;
+    for(int i = (d - 2); i >= 0; i--)
+    {
+        if(blens[i] != 1)
+        {
+            num_wg *= blens[i];
+        }
+    }
+    int max_num_wg = 4096;
+    num_wg         = num_wg > max_num_wg ? max_num_wg : num_wg;
+
+    size_t local_threads = 256;
+
+    // for naive tensor ops
+    size_t RD_BLCK              = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1;
+    const std::string data_type = GetDataType(bTensorDesc.GetType());
+    const std::string READ_TYPE = (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK);
+
+    size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1));
+    size_t grp_sz     = (total_work + local_threads - 1) / local_threads;
+
+    grp_sz        = std::min(size_t(max_num_wg), grp_sz);
+    size_t glb_sz = local_threads * grp_sz;
+
+    size_t local_threads2 = 64;
+    size_t total_work2    = clens[1];
+    size_t grp_sz2        = (total_work2 + local_threads2 - 1) / local_threads2;
+    grp_sz2               = std::min(size_t(max_num_wg / grp_sz), grp_sz2);
+    size_t glb_sz2        = local_threads2 * grp_sz2;
+
+    const std::vector<size_t> vld{local_threads, 1, 1};
+    const std::vector<size_t> vgd{glb_sz, glb_sz2, 1};
+
+    KernelBuildParameters build_params =
+        KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}};
+
+    // build_params.Define("MIOPEN_TENSOR_OP", std::to_string(problem.GetTensorOp()));
+
+    switch(problem.GetTensorOp())
+    {
+    case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break;
+    case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break;
+    case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break;
+    case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break;
+    }
+
+    // support for 64bit still not merged
+    // if(aTensorDesc.AllDimsFitIntoInt())
+    // {
+    //     build_params.Define("DIM_TYPE", "uint32_t");
+    // }
+    // else
+    // {
+    //     build_params.Define("DIM_TYPE", "uint64_t");
+    // }
+
+    build_params.Define("USE_2D_TENSOR_LITE");
+    build_params.Define("RD_BLCK", std::to_string(RD_BLCK));
+    build_params.Define("READ_TYPE", READ_TYPE);
+
+    auto kernel = KernelInfo{};
+
+    kernel.comp_options = build_params.GenerateFor(
+        kbp::HIP{}); // GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{});
+    kernel.kernel_file = "MIOpenTensorKernels.cl";
+    kernel.kernel_name = "Op2dTensorLite";
+
+    for(uint32_t i = 0; i <= 2; i++)
+    {
+        kernel.l_wk.push_back(vld[i]);
+        kernel.g_wk.push_back(vgd[i]);
+    }
+
+    result.invoker_factory = [=](const std::vector<Kernel> kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::tensor::InvokeParams>();
+
+            visit_float(bTensorDesc.GetType(), [&](auto as_float) {
+                auto miopen_alpha0 = as_float(*(static_cast<const float*>(params.alpha0)));
+                auto miopen_alpha1 = as_float(*(static_cast<const float*>(params.alpha1)));
+                auto miopen_beta   = as_float(*(static_cast<const float*>(params.beta)));
+
+                kernel(params.ATensor,
+                       static_cast<int>(astrides[1]),
+                       params.BTensor,
+                       static_cast<int>(bstrides[1]),
+                       params.CTensor,
+                       static_cast<int>(cstrides[1]),
+                       miopen_alpha0,
+                       miopen_alpha1,
+                       miopen_beta,
+                       static_cast<int>(params.Aoffset),
+                       static_cast<int>(params.Boffset),
+                       static_cast<int>(params.Coffset),
+                       static_cast<int>(!float_equal(miopen_beta, 0.0)),
+                       static_cast<int>(blens[1] == 1));
+            });
+        };
+    };
+    result.construction_params.push_back(kernel);
+
+    return result;
+}
+
+} // namespace tensor
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/tensor.cpp b/src/tensor.cpp
index 89400c9166..8280201c41 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -911,14 +911,11 @@ void OpTensorNew(Handle& handle,
                                                     Coffset,
                                                     nonStandardSquash};
 
-    const auto tensor_dim = aTensorDesc.GetLengths().size();
-
-    if(tensor_dim == 1)
-    {
-        const auto algo    = AlgorithmName{"Op1dTensorGeneric"};
-        const auto solvers = solver::SolverContainer<solver::tensor::Op1dTensorGeneric>{};
-        solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
-    }
+    const auto algo    = AlgorithmName{"TensorOpSolver"};
+    const auto solvers = solver::SolverContainer<solver::tensor::Op1dTensorGeneric>{} +
+                         solver::SolverContainer<solver::tensor::Op2dTensorLite>{} +
+                         solver::SolverContainer<solver::tensor::Op2dTensorGeneric>{};
+    solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
 }
 
 } // namespace miopen
diff --git a/src/tensor/problem_description.cpp b/src/tensor/problem_description.cpp
index 562efd6389..a3460ccff7 100644
--- a/src/tensor/problem_description.cpp
+++ b/src/tensor/problem_description.cpp
@@ -36,15 +36,19 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
     std::ostringstream ss;
     const auto tensor_dim = aTensorDesc.GetLengths().size();
 
+    auto alens = aTensorDesc.GetLengths();
+    auto blens = bTensorDesc.GetLengths();
+    auto clens = cTensorDesc.GetLengths();
+
+    size_t local_threads = 256;
+    int max_num_wg       = 4096;
+
     ss << std::to_string(bTensorDesc.GetType()) << "-" << std::to_string(aTensorDesc.GetType())
        << "-" << std::to_string(tensorOp);
 
     if(tensor_dim == 1)
     {
-        size_t local_threads = 256;
-        int max_num_wg       = 4096;
-        int num_wg =
-            std::clamp(cTensorDesc.GetLengths()[0] / local_threads, size_t(1), size_t(max_num_wg));
+        int num_wg = std::clamp(clens[0] / local_threads, size_t(1), size_t(max_num_wg));
         num_wg                = num_wg > max_num_wg ? max_num_wg : num_wg;
         size_t global_threads = num_wg * local_threads;
         ss << "-" << std::to_string(global_threads) << "-" << std::to_string(local_threads);
@@ -58,6 +62,38 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
         if(aTensorDesc.AllDimsFitIntoInt())
         {
             ss << "-32bit";
        }
        else
        {
             ss << "-64bit";
        }
     }
+    else if(tensor_dim == 2)
+    {
+        local_threads = 32;
+        int num_wg =
+            std::clamp((clens[0] * clens[1]) / local_threads, size_t(1), size_t(max_num_wg));
+        num_wg                = num_wg > max_num_wg ? max_num_wg : num_wg;
+        size_t global_threads = num_wg * local_threads;
+        ss << "-" << std::to_string(global_threads) << "-" << std::to_string(local_threads);
+    }
+    else if(tensor_dim == 3)
+    {
+
+        size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1;
+        size_t total_work     = std::max(clens[2] / RD_BLCK, size_t(1));
+        size_t grp_sz         = (total_work + local_threads - 1) / local_threads;
+        size_t local_threads2 = 64;
+        size_t total_work2    = clens[1];
+        size_t grp_sz2        = (total_work2 + local_threads2 - 1) / local_threads2;
+        grp_sz2               = std::min(size_t(max_num_wg / grp_sz), grp_sz2);
+
+        bool lite_applicable = grp_sz <= size_t(max_num_wg);
+
+        bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 &&
+                       (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2];
+
+        if(lite_applicable && is_lite)
+        {
+            ss << "-" << std::to_string(RD_BLCK) << "x" << std::to_string(local_threads) << "x"
+               << std::to_string(grp_sz) << std::to_string(local_threads2)
+               << std::to_string(grp_sz2);
+        }
+    }
 
     return NetworkConfig{ss.str()};
 }
diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp
index 152443e4fc..b18c0276fa 100644
--- a/test/tensor_ops.cpp
+++ b/test/tensor_ops.cpp
@@ -241,7 +241,7 @@ struct tensor_ops_driver : test_driver
 
     std::vector<std::vector<int>> get_sub_tensor_a()
     {
-        return {/*{32, 16, 8, 4, 4}, {16, 20, 16, 8}, {20, 16, 8}, {1, 16, 8}, {16, 8},*/ {8}};
+        return {/*{32, 16, 8, 4, 4}, {16, 20, 16, 8}, {20, 16, 8}, {1, 16, 8},*/ {16, 8}, {8}};
     }
 
     std::vector<std::vector<int>> get_sub_tensor_b()
@@ -263,10 +263,10 @@ struct tensor_ops_driver : test_driver
                 {20, 16, 1},
                 {1, 16, 8},
                 {1, 16, 1},
-                {20, 1, 1},
+                {20, 1, 1},*/
                 {16, 8},
                 {16, 1},
-                {1, 8},*/
+                {1, 8},
                 {8},
                 {1}};
     }

From f2a11d648b1fd0fb1667535b64aae2e57d17b511 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Fri, 1 Nov 2024 18:32:08 +0200
Subject: [PATCH 05/25] some changes suggested in the comments

---
 src/CMakeLists.txt                            |   9 +-
 .../{tensor => tensorOp}/invoke_params.hpp    |  10 +-
 .../problem_description.hpp                   |  42 +---
 .../miopen/{tensor => tensorOp}/solvers.hpp   |  54 +++--
 src/include/miopen/tensor_ops.hpp             |  30 +--
 .../Op1dTensorGeneric.cpp                     |  69 +++---
 .../Op2dTensorGeneric.cpp                     |  23 +-
 .../{tensor => tensorOp}/Op2dTensorLite.cpp   |  42 ++--
 src/solver/tensorOp/Op2dTensorSquash.cpp      | 197 ++++++++++++++++++
 src/tensor.cpp                                |  85 ++++----
 src/tensor/problem_description.cpp            | 103 ---------
 src/tensorOp/problem_description.cpp          |  78 +++++++
 test/tensor_ops.cpp                           |  36 ++--
 13 files changed, 477 insertions(+), 301 deletions(-)
 rename src/include/miopen/{tensor => tensorOp}/invoke_params.hpp (94%)
 rename src/include/miopen/{tensor => tensorOp}/problem_description.hpp (82%)
 rename src/include/miopen/{tensor => tensorOp}/solvers.hpp (56%)
 rename src/solver/{tensor => tensorOp}/Op1dTensorGeneric.cpp (76%)
 rename src/solver/{tensor => tensorOp}/Op2dTensorGeneric.cpp (90%)
 rename src/solver/{tensor => tensorOp}/Op2dTensorLite.cpp (87%)
 create mode 100644 src/solver/tensorOp/Op2dTensorSquash.cpp
 delete mode 100644 src/tensor/problem_description.cpp
 create mode 100644 src/tensorOp/problem_description.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9dc938cae4..56b99b79cf 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -337,15 +337,16 @@ set( MIOpen_Source
     solver/softmarginloss/forward_softmarginloss.cpp
     solver/softmax/attn_softmax.cpp
     solver/softmax/softmax.cpp
-    solver/tensor/Op1dTensorGeneric.cpp
-    solver/tensor/Op2dTensorGeneric.cpp
-    solver/tensor/Op2dTensorLite.cpp
+    solver/tensorOp/Op1dTensorGeneric.cpp
+    solver/tensorOp/Op2dTensorGeneric.cpp
+    solver/tensorOp/Op2dTensorLite.cpp
+    solver/tensorOp/Op2dTensorSquash.cpp
     subbuffers.cpp
     t5layernorm_api.cpp
     target_properties.cpp
     temp_file.cpp
     tensor.cpp
-    tensor/problem_description.cpp
+    tensorOp/problem_description.cpp
     tensor_api.cpp
     transformers_adam_w_api.cpp
     seq_tensor.cpp
diff --git a/src/include/miopen/tensor/invoke_params.hpp b/src/include/miopen/tensorOp/invoke_params.hpp
similarity index 94%
rename from src/include/miopen/tensor/invoke_params.hpp
rename to src/include/miopen/tensorOp/invoke_params.hpp
index 68c96fda3f..99ff13da47 100644
--- a/src/include/miopen/tensor/invoke_params.hpp
+++ b/src/include/miopen/tensorOp/invoke_params.hpp
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -31,7 +31,7 @@
 
 namespace miopen {
 
-namespace tensor {
+namespace tensorOp {
 
 struct InvokeParams : public miopen::InvokeParams
 {
@@ -52,7 +52,7 @@ struct InvokeParams : public miopen::InvokeParams
         : alpha0(alpha0_),
           alpha1(alpha1_),
           beta(beta_),
-          tensorOp(tensorOp_),
+          tensorOperation(tensorOp_),
           aTensorDesc(aTensorDesc_),
           ATensor(ATensor_),
           bTensorDesc(bTensorDesc_),
          BTensor(BTensor_),
@@ -74,7 +74,7 @@ struct InvokeParams : public miopen::InvokeParams
     const void* alpha1;
     const void* beta;
 
-    miopenTensorOp_t tensorOp;
+    miopenTensorOp_t tensorOperation;
 
     TensorDescriptor aTensorDesc;
     ConstData_t ATensor;
@@ -92,6 +92,6 @@ struct InvokeParams : public miopen::InvokeParams
     bool nonStandardSquash;
 };
 
-} // namespace tensor
+} // namespace tensorOp
 
 } // namespace miopen
diff --git a/src/include/miopen/tensor/problem_description.hpp b/src/include/miopen/tensorOp/problem_description.hpp
similarity index 82%
rename from src/include/miopen/tensor/problem_description.hpp
rename to src/include/miopen/tensorOp/problem_description.hpp
index 41a9e4d848..81621cfcbe 100644
--- a/src/include/miopen/tensor/problem_description.hpp
+++ b/src/include/miopen/tensorOp/problem_description.hpp
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -33,13 +33,11 @@ namespace miopen {
 
 struct NetworkConfig;
 
-namespace tensor {
+namespace tensorOp {
 
 struct ProblemDescription : ProblemDescriptionBase
 {
     ProblemDescription(const miopenTensorOp_t tensorOp_,
-                       const void* alpha0_,
-                       const void* alpha1_,
                        const void* beta_,
                        const TensorDescriptor& aTensorDesc_,
                        const TensorDescriptor& bTensorDesc_,
@@ -51,7 +49,11 @@ struct ProblemDescription : ProblemDescriptionBase
           cTensorDesc(cTensorDesc_),
           nonStandardSquash(nonStandardSquash_)
     {
-        CheckAndAssignAlphaBeta(alpha0_, alpha1_, beta_);
+        if(beta_ == nullptr)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Beta value is nullptr");
+        }
+        beta = *(static_cast<const float*>(beta_));
 
         if(aTensorDesc.GetElementSize() != cTensorDesc.GetElementSize())
         {
@@ -101,9 +103,7 @@ struct ProblemDescription : ProblemDescriptionBase
 
     const miopenTensorOp_t GetTensorOp() const { return tensorOp; }
 
-    const void* GetAlpha0() const { return alpha0; }
-    const void* GetAlpha1() const { return alpha1; }
-    const void* GetBeta() const { return beta; }
+    float GetBeta() const { return beta; }
 
     const TensorDescriptor& GetATensorDesc() const { return aTensorDesc; }
     const TensorDescriptor& GetBTensorDesc() const { return bTensorDesc; }
@@ -114,31 +114,9 @@ struct ProblemDescription : ProblemDescriptionBase
     NetworkConfig MakeNetworkConfig() const override;
 
 private:
-    void CheckAndAssignAlphaBeta(const void* alpha0_, const void* alpha1_, const void* beta_)
-    {
-        if(alpha0_ == nullptr)
-        {
-            MIOPEN_THROW(miopenStatusBadParm, "Alpha0 value is nullptr");
-        }
-        if(alpha1_ == nullptr)
-        {
-            MIOPEN_THROW(miopenStatusBadParm, "Alpha1 value is nullptr");
-        }
-        if(beta_ == nullptr)
-        {
-            MIOPEN_THROW(miopenStatusBadParm, "Beta value is nullptr");
-        }
-
-        alpha0 = alpha0_;
-        alpha1 = alpha1_;
-        beta   = beta_;
-    }
-
     const miopenTensorOp_t tensorOp;
 
-    const void* alpha0;
-    const void* alpha1;
-    const void* beta;
+    float beta;
 
     const TensorDescriptor& aTensorDesc;
     const TensorDescriptor& bTensorDesc;
     const TensorDescriptor& cTensorDesc;
@@ -147,6 +125,6 @@ struct ProblemDescription : ProblemDescriptionBase
     const bool nonStandardSquash;
 };
 
-} // namespace tensor
+} // namespace tensorOp
 
 } // namespace miopen
diff --git a/src/include/miopen/tensor/solvers.hpp b/src/include/miopen/tensorOp/solvers.hpp
similarity index 56%
rename from src/include/miopen/tensor/solvers.hpp
rename to src/include/miopen/tensorOp/solvers.hpp
index 072f708c84..290a8b2cd9 100644
--- a/src/include/miopen/tensor/solvers.hpp
+++ b/src/include/miopen/tensorOp/solvers.hpp
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@
 #pragma once
 
 #include <miopen/solver.hpp>
-#include <miopen/tensor/problem_description.hpp>
+#include <miopen/tensorOp/problem_description.hpp>
 
 #include <utility>
 
 namespace miopen {
 
 namespace solver {
 
-namespace tensor {
+namespace tensorOp {
 
-using TensorOpSolver = NonTunableSolverBase<ExecutionContext, miopen::tensor::ProblemDescription>;
+using TensorOpSolver = NonTunableSolverBase<ExecutionContext, miopen::tensorOp::ProblemDescription>;
 
 struct Op1dTensorGeneric final : TensorOpSolver
 {
     const std::string& SolverDbId() const override { return GetSolverDbId<Op1dTensorGeneric>(); }
 
     bool IsApplicable(const ExecutionContext& context,
-                      const miopen::tensor::ProblemDescription& problem) const override;
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
 
     ConvSolution GetSolution(const ExecutionContext& context,
-                             const miopen::tensor::ProblemDescription& problem) const override;
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
 
-    std::size_t GetWorkspaceSize(const ExecutionContext& context,
-                                 const miopen::tensor::ProblemDescription& problem) const override;
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
 
     bool MayNeedWorkspace() const override { return false; }
 };
@@ -60,13 +61,14 @@ struct Op2dTensorGeneric final : TensorOpSolver
     const std::string& SolverDbId() const override { return GetSolverDbId<Op2dTensorGeneric>(); }
 
     bool IsApplicable(const ExecutionContext& context,
-                      const miopen::tensor::ProblemDescription& problem) const override;
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
 
     ConvSolution GetSolution(const ExecutionContext& context,
-                             const miopen::tensor::ProblemDescription& problem) const override;
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
 
-    std::size_t GetWorkspaceSize(const ExecutionContext& context,
-                                 const miopen::tensor::ProblemDescription& problem) const override;
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
 
     bool MayNeedWorkspace() const override { return false; }
 };
@@ -76,18 +78,36 @@ struct Op2dTensorLite final : TensorOpSolver
     const std::string& SolverDbId() const override { return GetSolverDbId<Op2dTensorLite>(); }
 
     bool IsApplicable(const ExecutionContext& context,
-                      const miopen::tensor::ProblemDescription& problem) const override;
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
 
     ConvSolution GetSolution(const ExecutionContext& context,
-                             const miopen::tensor::ProblemDescription& problem) const override;
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
 
-    std::size_t GetWorkspaceSize(const ExecutionContext& context,
-                                 const miopen::tensor::ProblemDescription& problem) const override;
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
 
     bool MayNeedWorkspace() const override { return false; }
 };
 
+struct Op2dTensorSquash final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op2dTensorSquash>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
-} // namespace tensor
+} // namespace tensorOp
 
 } // namespace solver
 
 } // namespace miopen
diff --git a/src/include/miopen/tensor_ops.hpp b/src/include/miopen/tensor_ops.hpp
index fa5be24048..c19eb333f2 100644
--- a/src/include/miopen/tensor_ops.hpp
+++ b/src/include/miopen/tensor_ops.hpp
@@ -189,21 +189,21 @@ MIOPEN_INTERNALS_EXPORT void OpTensor(const Handle& handle,
                                       size_t Coffset = 0,
                                       bool nonStandardSquash = false);
 
-MIOPEN_INTERNALS_EXPORT void OpTensorNew(Handle& handle,
-                                         miopenTensorOp_t tensorOp,
-                                         const void* alpha0,
-                                         const TensorDescriptor& aTensorDesc,
-                                         ConstData_t ATensor,
-                                         const void* alpha1,
-                                         const TensorDescriptor& bTensorDesc,
-                                         ConstData_t BTensor,
-                                         const void* beta,
-                                         const TensorDescriptor& cTensorDesc,
-                                         Data_t CTensor,
-                                         size_t Aoffset = 0,
-                                         size_t Boffset = 0,
-                                         size_t Coffset = 0,
-                                         bool nonStandardSquash = false);
+MIOPEN_INTERNALS_EXPORT void OpTensor2(Handle& handle,
+                                       miopenTensorOp_t tensorOp,
+                                       const void* alpha0,
+                                       const TensorDescriptor& aTensorDesc,
+                                       ConstData_t ATensor,
+                                       const void* alpha1,
+                                       const TensorDescriptor& bTensorDesc,
+                                       ConstData_t BTensor,
+                                       const void* beta,
+                                       const TensorDescriptor& cTensorDesc,
+                                       Data_t CTensor,
+                                       size_t Aoffset = 0,
+                                       size_t Boffset = 0,
+                                       size_t Coffset = 0,
+                                       bool nonStandardSquash = false);
 
 MIOPEN_INTERNALS_EXPORT void CopyTensor(const Handle& handle,
                                         const TensorDescriptor& srcDesc,
diff --git a/src/solver/tensor/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp
similarity index 76%
rename from src/solver/tensor/Op1dTensorGeneric.cpp
rename to src/solver/tensorOp/Op1dTensorGeneric.cpp
index e550ec7952..54bb19e646 100644
--- a/src/solver/tensor/Op1dTensorGeneric.cpp
+++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -24,9 +24,9 @@ * *******************************************************************************/ -#include +#include -#include +#include #include #include #include @@ -36,10 +36,10 @@ namespace miopen { namespace solver { -namespace tensor { +namespace tensorOp { bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const + const miopen::tensorOp::ProblemDescription& problem) const { auto aTensorDesc = problem.GetATensorDesc(); auto bTensorDesc = problem.GetBTensorDesc(); @@ -47,6 +47,11 @@ bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, auto blens = bTensorDesc.GetLengths(); auto asize = alens.size(); + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + if(asize == 1) { return true; @@ -66,21 +71,22 @@ bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, std::size_t Op1dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const + const miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution Op1dTensorGeneric::GetSolution(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const +ConvSolution +Op1dTensorGeneric::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto cTensorDesc = problem.GetCTensorDesc(); + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); - auto clens = cTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); size_t local_threads = 256; size_t max_num_wg = 4096; @@ -89,14 +95,12 @@ ConvSolution Op1dTensorGeneric::GetSolution(const ExecutionContext& context, num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; size_t global_threads = num_wg * local_threads; - const std::vector vld{local_threads, 1, 1}; - const std::vector vgd{global_threads, 1, 1}; + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; KernelBuildParameters build_params = KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}}; - // build_params.Define("MIOPEN_TENSOR_OP", std::to_string(problem.GetTensorOp())); - switch(problem.GetTensorOp()) { case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; @@ -118,35 +122,34 @@ ConvSolution Op1dTensorGeneric::GetSolution(const ExecutionContext& context, auto kernel = KernelInfo{}; - kernel.comp_options = build_params.GenerateFor( - kbp::HIP{}); // GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{}); + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); // + GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{}); kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; kernel.kernel_name = "Op1dTensorGeneric"; - for(uint32_t i = 0; i <= 2; i++) - { - kernel.l_wk.push_back(vld[i]); - kernel.g_wk.push_back(vgd[i]); - } + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [=](const std::vector kernels) { + result.invoker_factory = [](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); + decltype(auto) params = raw_params.CastTo(); - visit_float(bTensorDesc.GetType(), [&](auto as_float) { + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - auto blens = params.bTensorDesc.GetLengths(); - auto clens = params.cTensorDesc.GetLengths(); + const auto& blens = params.bTensorDesc.GetLengths(); + const auto& clens = params.cTensorDesc.GetLengths(); - auto astrides = params.aTensorDesc.GetStrides(); - auto bstrides = params.bTensorDesc.GetStrides(); - auto cstrides = params.cTensorDesc.GetStrides(); + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); - if(aTensorDesc.AllDimsFitIntoInt()) + if(params.aTensorDesc.AllDimsFitIntoInt()) { // change offsets to 64bit after PR is merged kernel(params.ATensor, params.BTensor, @@ -188,7 +191,7 @@ ConvSolution Op1dTensorGeneric::GetSolution(const ExecutionContext& context, return result; } -} // namespace tensor +} // namespace tensorOp } // namespace solver diff --git a/src/solver/tensor/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp similarity index 90% rename from src/solver/tensor/Op2dTensorGeneric.cpp rename to src/solver/tensorOp/Op2dTensorGeneric.cpp index 6155117ba9..640c3be115 100644 --- a/src/solver/tensor/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
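The run-time half of this refactor is the pattern just above: the invoker factory no longer captures the build-time tensor descriptors; everything is read from the InvokeParams that arrive with the call. A minimal sketch of that shape (names follow the surrounding diff; the kernel argument list is elided):

    // Sketch of the capture-free invoker used by these solvers. The empty
    // capture list is the point: descriptors and scalars come from the
    // run-time params, not baked in when the solution is built.
    result.invoker_factory = [](const std::vector<Kernel> kernels) {
        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
            decltype(auto) kernel = handle_.Run(kernels.front());
            decltype(auto) params = raw_params.CastTo<miopen::tensorOp::InvokeParams>();
            visit_float(params.bTensorDesc.GetType(), [&](auto as_float) {
                // alpha/beta scalars and the A/B/C buffers all come from params
                kernel(params.ATensor, params.BTensor, params.CTensor /* , ... */);
            });
        };
    };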
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -24,9 +24,9 @@ * *******************************************************************************/ -#include +#include -#include +#include #include #include #include @@ -36,10 +36,10 @@ namespace miopen { namespace solver { -namespace tensor { +namespace tensorOp { bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const + const miopen::tensorOp::ProblemDescription& problem) const { auto aTensorDesc = problem.GetATensorDesc(); auto bTensorDesc = problem.GetBTensorDesc(); @@ -58,13 +58,14 @@ bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, std::size_t Op2dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const + const miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution Op2dTensorGeneric::GetSolution(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const +ConvSolution +Op2dTensorGeneric::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; @@ -87,8 +88,6 @@ ConvSolution Op2dTensorGeneric::GetSolution(const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}}; - // build_params.Define("MIOPEN_TENSOR_OP", std::to_string(problem.GetTensorOp())); - switch(problem.GetTensorOp()) { case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; @@ -125,7 +124,7 @@ ConvSolution Op2dTensorGeneric::GetSolution(const ExecutionContext& context, result.invoker_factory = [=](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); + decltype(auto) params = raw_params.CastTo(); visit_float(bTensorDesc.GetType(), [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); @@ -166,7 +165,7 @@ ConvSolution Op2dTensorGeneric::GetSolution(const ExecutionContext& context, return result; } -} // namespace tensor +} // namespace tensorOp } // namespace solver diff --git a/src/solver/tensor/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp similarity index 87% rename from src/solver/tensor/Op2dTensorLite.cpp rename to src/solver/tensorOp/Op2dTensorLite.cpp index 712a32d49d..9c53a3e99e 100644 --- a/src/solver/tensor/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -24,9 +24,9 @@ * *******************************************************************************/ -#include +#include -#include +#include #include #include #include @@ -36,10 +36,10 @@ namespace miopen { namespace solver { -namespace tensor { +namespace tensorOp { bool Op2dTensorLite::IsApplicable(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const + const miopen::tensorOp::ProblemDescription& problem) const { auto aTensorDesc = problem.GetATensorDesc(); auto bTensorDesc = problem.GetBTensorDesc(); @@ -51,6 +51,11 @@ bool Op2dTensorLite::IsApplicable(const ExecutionContext& context, auto asize = alens.size(); + if(asize < 3) + { + return false; + } + size_t local_threads = 256; int max_num_wg = 4096; @@ -75,13 +80,13 @@ bool Op2dTensorLite::IsApplicable(const ExecutionContext& context, std::size_t Op2dTensorLite::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const + const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; @@ -141,8 +146,6 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}}; - // build_params.Define("MIOPEN_TENSOR_OP", std::to_string(problem.GetTensorOp())); - switch(problem.GetTensorOp()) { case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; @@ -151,26 +154,15 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break; } - // support for 64bit still not merged - // if(aTensorDesc.AllDimsFitIntoInt()) - // { - // build_params.Define("DIM_TYPE", "uint32_t"); - // } - // else - // { - // build_params.Define("DIM_TYPE", "uint64_t"); - // } - build_params.Define("USE_2D_TENSOR_LITE"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); build_params.Define("READ_TYPE", READ_TYPE); auto kernel = KernelInfo{}; - kernel.comp_options = build_params.GenerateFor( - kbp::HIP{}); // GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{}); - kernel.kernel_file = "MIOpenTensorKernels.cl"; - kernel.kernel_name = "Op2dTensorLite"; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.kernel_name = "Op2dTensorLite"; for(uint32_t i = 0; i <= 2; i++) { @@ -181,7 +173,7 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, result.invoker_factory = [=](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); + decltype(auto) params = raw_params.CastTo(); visit_float(bTensorDesc.GetType(), [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); @@ -210,7 +202,7 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, return result; } -} // namespace tensor +} // namespace tensorOp } // namespace solver diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp 
b/src/solver/tensorOp/Op2dTensorSquash.cpp
new file mode 100644
index 0000000000..687123ed0a
--- /dev/null
+++ b/src/solver/tensorOp/Op2dTensorSquash.cpp
@@ -0,0 +1,197 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensorOp {
+
+bool Op2dTensorSquash::IsApplicable(const ExecutionContext& context,
+                                    const miopen::tensorOp::ProblemDescription& problem) const
+{
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto cTensorDesc = problem.GetCTensorDesc();
+
+    auto alens = aTensorDesc.GetLengths();
+    auto blens = bTensorDesc.GetLengths();
+    auto clens = cTensorDesc.GetLengths();
+
+    auto asize = alens.size();
+
+    if(asize < 3)
+    {
+        return false;
+    }
+
+    bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 &&
+                   (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2];
+
+    bool is_squashed = problem.GetNonStandardSquash() && !is_lite &&
+                       (blens[0] == 1 && clens[0] == 1 && clens[1] == 1 && blens[2] == clens[2]);
+
+    if(asize == 3 && is_squashed)
+    {
+        return true;
+    }
+
+    return false;
+}
+
+std::size_t
+Op2dTensorSquash::GetWorkspaceSize(const ExecutionContext& context,
+                                   const miopen::tensorOp::ProblemDescription& problem) const
+{
+    return 0;
+}
+
+ConvSolution
+Op2dTensorSquash::GetSolution(const ExecutionContext& context,
+                              const miopen::tensorOp::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto cTensorDesc = problem.GetCTensorDesc();
+
+    auto alens = aTensorDesc.GetLengths();
+    auto blens = bTensorDesc.GetLengths();
+    auto clens = cTensorDesc.GetLengths();
+
+    auto astrides = aTensorDesc.GetStrides();
+    auto bstrides = bTensorDesc.GetStrides();
+    auto cstrides = cTensorDesc.GetStrides();
+
+    // first_not_one is incorrect if the b tensor's total size equals 1
+    auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; });
+    auto d             = std::distance(blens.begin(), first_not_one.base());
+
+    // quick fix: use a single workgroup when every b length is 1 (and guard a zero length)
+    int num_wg = first_not_one != blens.rend()
+                     ? static_cast<int>(*first_not_one == 0 ? 1 : *first_not_one)
+                     : 1;
+
+    for(int i = (d - 2); i >= 0; i--)
+    {
+        if(blens[i] != 1)
+        {
+            num_wg *= blens[i];
+        }
+    }
+    int max_num_wg = 4096;
+    num_wg         = num_wg > max_num_wg ? max_num_wg : num_wg;
+
+    size_t local_threads = 256;
+
+    // for naive tensor ops
+    size_t RD_BLCK              = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1;
+    const std::string data_type = GetDataType(bTensorDesc.GetType());
+    const std::string READ_TYPE = (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK);
+
+    size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1));
+    size_t grp_sz     = (total_work + local_threads - 1) / local_threads;
+
+    grp_sz        = std::min(size_t(max_num_wg), grp_sz);
+    size_t glb_sz = local_threads * grp_sz;
+
+    const std::vector<size_t> vld{local_threads, 1, 1};
+    const std::vector<size_t> vgd{glb_sz, 1, 1};
+
+    KernelBuildParameters build_params =
+        KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}};
+
+    switch(problem.GetTensorOp())
+    {
+    case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break;
+    case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break;
+    case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break;
+    case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break;
+    }
+
+    build_params.Define("USE_2D_TENSOR_SQUASH");
+    build_params.Define("RD_BLCK", std::to_string(RD_BLCK));
+    build_params.Define("READ_TYPE", READ_TYPE);
+
+    auto kernel = KernelInfo{};
+
+    kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+    kernel.kernel_file  = "MIOpenTensorKernels.cl";
+    kernel.kernel_name  = "Op2dTensorSquash";
+
+    for(uint32_t i = 0; i <= 2; i++)
+    {
+        kernel.l_wk.push_back(vld[i]);
+        kernel.g_wk.push_back(vgd[i]);
+    }
+
+    result.invoker_factory = [=](const std::vector<Kernel> kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::tensorOp::InvokeParams>();
+
+            visit_float(bTensorDesc.GetType(), [&](auto as_float) {
+                auto miopen_alpha0 = as_float(*(static_cast<const float*>(params.alpha0)));
+                auto miopen_alpha1 = as_float(*(static_cast<const float*>(params.alpha1)));
+                auto miopen_beta   = as_float(*(static_cast<const float*>(params.beta)));
+
+                kernel(params.ATensor,
+                       params.BTensor,
+                       static_cast<int>(blens[1]),
+                       static_cast<int>(bstrides[1]),
+                       params.CTensor,
+                       miopen_alpha0,
+                       miopen_alpha1,
+                       miopen_beta,
+                       static_cast<int>(params.Aoffset),
+                       static_cast<int>(params.Boffset),
+                       static_cast<int>(params.Coffset),
+                       static_cast<int>(total_work),
+                       static_cast<int>(!float_equal(miopen_alpha0, 0.0)),
+                       static_cast<int>(!float_equal(miopen_alpha1, 0.0)),
+                       static_cast<int>(!float_equal(miopen_beta, 0.0)));
+            });
+        };
+    };
+    result.construction_params.push_back(kernel);
+
+    return result;
+}
+
+} // namespace tensorOp
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/tensor.cpp b/src/tensor.cpp
index 8280201c41..6c258c0e7d 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -29,8 +29,8 @@
 #include
 #include
 #include
-#include
-#include
+#include
+#include
 #include
 #include
@@ -872,49 +872,60 @@ void from_json(const nlohmann::json& j, TensorDescriptor& descriptor)
     j.at("type").get_to(descriptor.type);
 }
 
-void OpTensorNew(Handle& handle,
-                 miopenTensorOp_t tensorOp,
-                 const void* alpha0,
-                 const TensorDescriptor& aTensorDesc,
-                 ConstData_t ATensor,
-                 const void* alpha1,
-                 const TensorDescriptor& bTensorDesc,
-                 ConstData_t BTensor,
-                 const void* beta,
-                 const TensorDescriptor& cTensorDesc,
-                 Data_t CTensor,
-                 const size_t Aoffset,
-                 const size_t Boffset,
-                 const
size_t Coffset, - bool nonStandardSquash) +void OpTensor2(Handle& handle, + miopenTensorOp_t tensorOp, + const void* alpha0, + const TensorDescriptor& aTensorDesc, + ConstData_t ATensor, + const void* alpha1, + const TensorDescriptor& bTensorDesc, + ConstData_t BTensor, + const void* beta, + const TensorDescriptor& cTensorDesc, + Data_t CTensor, + const size_t Aoffset, + const size_t Boffset, + const size_t Coffset, + bool nonStandardSquash) { if(ATensor == nullptr || BTensor == nullptr || CTensor == nullptr) { MIOPEN_THROW(miopenStatusBadParm); } - const auto problem = tensor::ProblemDescription{ - tensorOp, alpha0, alpha1, beta, aTensorDesc, bTensorDesc, cTensorDesc, nonStandardSquash}; - - const auto invoke_params = tensor::InvokeParams{tensorOp, - alpha0, - aTensorDesc, - ATensor, - alpha1, - bTensorDesc, - BTensor, - beta, - cTensorDesc, - CTensor, - Aoffset, - Boffset, - Coffset, - nonStandardSquash}; + if(alpha0 == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "Alpha0 value is nullptr"); + } + + if(alpha1 == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "Alpha1 value is nullptr"); + } + + const auto problem = tensorOp::ProblemDescription{ + tensorOp, beta, aTensorDesc, bTensorDesc, cTensorDesc, nonStandardSquash}; + + const auto invoke_params = tensorOp::InvokeParams{tensorOp, + alpha0, + aTensorDesc, + ATensor, + alpha1, + bTensorDesc, + BTensor, + beta, + cTensorDesc, + CTensor, + Aoffset, + Boffset, + Coffset, + nonStandardSquash}; const auto algo = AlgorithmName{"TensorOpSolver"}; - const auto solvers = solver::SolverContainer{} + - solver::SolverContainer{} + - solver::SolverContainer{}; + const auto solvers = solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); } diff --git a/src/tensor/problem_description.cpp b/src/tensor/problem_description.cpp deleted file mode 100644 index a3460ccff7..0000000000 --- a/src/tensor/problem_description.cpp +++ /dev/null @@ -1,103 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2023 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
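For reference, the new entry point is exercised the same way OpTensor was; a minimal call looks like this (a sketch assuming a live Handle, packed descriptors aDesc/bDesc/cDesc, and device buffers a_dev/b_dev/c_dev, mirroring the test driver below):

    // C = op(alpha0 * A, alpha1 * B) + beta * C, dispatched through the solver chain above.
    const float alpha0 = 1.0f, alpha1 = 1.0f, beta = 0.0f;
    miopen::OpTensor2(handle,
                      miopenTensorOpAdd,
                      &alpha0, aDesc, a_dev,
                      &alpha1, bDesc, b_dev,
                      &beta, cDesc, c_dev);
    // Aoffset/Boffset/Coffset default to 0, nonStandardSquash to false.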
- * - *******************************************************************************/ - -#include -#include - -namespace miopen { - -namespace tensor { - -NetworkConfig ProblemDescription::MakeNetworkConfig() const -{ - std::ostringstream ss; - const auto tensor_dim = aTensorDesc.GetLengths().size(); - - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); - - size_t local_threads = 256; - int max_num_wg = 4096; - - ss << std::to_string(bTensorDesc.GetType()) << "-" << std::to_string(aTensorDesc.GetType()) - << "-" << std::to_string(tensorOp); - - if(tensor_dim == 1) - { - int num_wg = std::clamp(clens[0] / local_threads, size_t(1), size_t(max_num_wg)); - num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; - size_t global_threads = num_wg * local_threads; - ss << "-" << std::to_string(global_threads) << "-" << std::to_string(local_threads); - - if(aTensorDesc.AllDimsFitIntoInt()) - { - ss << "-32bit"; - } - else - { - ss << "-64bit"; - } - } - else if(tensor_dim == 2) - { - local_threads = 32; - int num_wg = - std::clamp((clens[0] * clens[1]) / local_threads, size_t(1), size_t(max_num_wg)); - num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; - size_t global_threads = num_wg * local_threads; - ss << "-" << std::to_string(global_threads) << "-" << std::to_string(local_threads); - } - else if(tensor_dim == 3) - { - - size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1; - size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); - size_t grp_sz = (total_work + local_threads - 1) / local_threads; - size_t local_threads2 = 64; - size_t total_work2 = clens[1]; - size_t grp_sz2 = (total_work2 + local_threads2 - 1) / local_threads2; - grp_sz2 = std::min(size_t(max_num_wg / grp_sz), grp_sz2); - - bool lite_applicable = grp_sz <= size_t(max_num_wg); - - bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 && - (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2]; - - if(lite_applicable && is_lite) - { - ss << "-" << std::to_string(RD_BLCK) << "x" << std::to_string(local_threads) << "x" - << std::to_string(grp_sz) << std::to_string(local_threads2) - << std::to_string(grp_sz2); - } - } - - return NetworkConfig{ss.str()}; -} - -} // namespace tensor - -} // namespace miopen diff --git a/src/tensorOp/problem_description.cpp b/src/tensorOp/problem_description.cpp new file mode 100644 index 0000000000..dc16276f05 --- /dev/null +++ b/src/tensorOp/problem_description.cpp @@ -0,0 +1,78 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +namespace miopen { + +namespace tensorOp { + +NetworkConfig ProblemDescription::MakeNetworkConfig() const +{ + std::ostringstream ss; + + auto alens = aTensorDesc.GetLengths(); + auto blens = bTensorDesc.GetLengths(); + + auto astrides = aTensorDesc.GetStrides(); + auto bstrides = bTensorDesc.GetStrides(); + auto cstrides = cTensorDesc.GetStrides(); + + std::string alens_str{}; + std::string blens_str{}; + std::string astrides_str{}; + std::string bstrides_str{}; + std::string cstrides_str{}; + + for(uint32_t i = 0; i < alens.size(); i++) + { + alens_str += std::to_string(alens[i]); + blens_str += std::to_string(blens[i]); + astrides_str += std::to_string(astrides[i]); + bstrides_str += std::to_string(bstrides[i]); + cstrides_str += std::to_string(cstrides[i]); + + if(i != (alens.size() - 1)) + { + alens_str += "x"; + blens_str += "x"; + astrides_str += "x"; + bstrides_str += "x"; + cstrides_str += "x"; + } + } + + ss << std::to_string(aTensorDesc.GetType()) << "-" << std::to_string(tensorOp) << "-" + << alens_str << "-" << blens_str << "-" << astrides_str << "-" << bstrides_str << "-" + << cstrides_str << "-" << std::to_string((beta == 0)); + + return NetworkConfig{ss.str()}; +} + +} // namespace tensorOp + +} // namespace miopen diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp index b18c0276fa..c93d1b1eb1 100644 --- a/test/tensor_ops.cpp +++ b/test/tensor_ops.cpp @@ -181,24 +181,24 @@ struct verify_tensor_ops auto a_dev = handle.Write(a.data); auto b_dev = handle.Write(b.data); - miopen::OpTensorNew(handle, - // miopenTensorOpAdd, - // miopenTensorOpMax, - // miopenTensorOpMin, - miopenTensorOpMul, - &alpha0, - a.desc, - a_dev.get(), - &alpha1, - b.desc, - b_dev.get(), - &beta, - c.desc, - c_dev.get(), - Aoffset, - Boffset, - Coffset, - false); // it does not verify non-standard behaviour + miopen::OpTensor2(handle, + // miopenTensorOpAdd, + // miopenTensorOpMax, + // miopenTensorOpMin, + miopenTensorOpMul, + &alpha0, + a.desc, + a_dev.get(), + &alpha1, + b.desc, + b_dev.get(), + &beta, + c.desc, + c_dev.get(), + Aoffset, + Boffset, + Coffset, + false); // it does not verify non-standard behaviour if(not no_validate) { From ac13ff30e6b20b415d1fcb4756b5bf87ed95ecf2 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Mon, 4 Nov 2024 17:07:48 +0200 Subject: [PATCH 06/25] additional changes --- src/CMakeLists.txt | 1 + src/include/miopen/tensorOp/solvers.hpp | 17 ++ src/solver/tensorOp/Op1dTensorGeneric.cpp | 35 +--- src/solver/tensorOp/Op2dTensorGeneric.cpp | 73 ++++----- src/solver/tensorOp/Op2dTensorLite.cpp | 77 +++++---- src/solver/tensorOp/Op2dTensorSquash.cpp | 73 ++++----- src/solver/tensorOp/Op3dTensorGeneric.cpp | 186 ++++++++++++++++++++++ src/solver/tensorOp/tensor_op_helpers.hpp | 77 +++++++++ src/tensor.cpp | 3 +- test/tensor_ops.cpp | 6 +- 10 files changed, 393 insertions(+), 155 deletions(-) create mode 100644 src/solver/tensorOp/Op3dTensorGeneric.cpp create mode 100644 src/solver/tensorOp/tensor_op_helpers.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 56b99b79cf..85912d3b6f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -341,6 +341,7 @@ set( MIOpen_Source 
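(A note on the MakeNetworkConfig above: the cache key is now fully shape- and stride-qualified. As an illustration with assumed packed tensors, a = c = 20x16x8 and b = 1x16x8 produce a key of the form <type>-<op>-20x16x8-1x16x8-128x8x1-128x8x1-128x8x1-<beta==0>, where <type> and <op> are the numeric enum values and the trailing field records whether beta is zero.)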
solver/tensorOp/Op2dTensorGeneric.cpp solver/tensorOp/Op2dTensorLite.cpp solver/tensorOp/Op2dTensorSquash.cpp + solver/tensorOp/Op3dTensorGeneric.cpp subbuffers.cpp t5layernorm_api.cpp target_properties.cpp diff --git a/src/include/miopen/tensorOp/solvers.hpp b/src/include/miopen/tensorOp/solvers.hpp index 290a8b2cd9..9eb9e187b6 100644 --- a/src/include/miopen/tensorOp/solvers.hpp +++ b/src/include/miopen/tensorOp/solvers.hpp @@ -107,6 +107,23 @@ struct Op2dTensorSquash final : TensorOpSolver bool MayNeedWorkspace() const override { return false; } }; +struct Op3dTensorGeneric final : TensorOpSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } +}; + } // namespace tensorOp } // namespace solver diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index 54bb19e646..640f9968e2 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -24,8 +24,8 @@ * *******************************************************************************/ +#include "tensor_op_helpers.hpp" #include - #include #include #include @@ -41,10 +41,10 @@ namespace tensorOp { bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); + const auto& aTensorDesc = problem.GetATensorDesc(); + // const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& alens = aTensorDesc.GetLengths(); + // const auto& blens = bTensorDesc.GetLengths(); auto asize = alens.size(); if(GetDataType(aTensorDesc.GetType()) == "double") @@ -82,8 +82,6 @@ Op1dTensorGeneric::GetSolution(const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; - const auto& aTensorDesc = problem.GetATensorDesc(); - const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); const auto& clens = cTensorDesc.GetLengths(); @@ -98,32 +96,15 @@ Op1dTensorGeneric::GetSolution(const ExecutionContext& context, const std::array vld{local_threads, 1, 1}; const std::array vgd{global_threads, 1, 1}; - KernelBuildParameters build_params = - KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}}; - - switch(problem.GetTensorOp()) - { - case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; - case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break; - case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break; - case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break; - } + KernelBuildParameters build_params = KernelBuildParameters{}; - if(aTensorDesc.AllDimsFitIntoInt()) - { - build_params.Define("DIM_TYPE", "uint32_t"); - } - else - { - build_params.Define("DIM_TYPE", "uint64_t"); - } + GetCommonParams(build_params, problem, true); build_params.Define("USE_1D_TENSOR_GENERIC"); auto kernel = KernelInfo{}; - kernel.comp_options 
= build_params.GenerateFor(kbp::HIP{}); // - GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{}); + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; kernel.kernel_name = "Op1dTensorGeneric"; diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index 640c3be115..92a19d6a99 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -23,9 +23,8 @@ * SOFTWARE. * *******************************************************************************/ - +#include "tensor_op_helpers.hpp" #include - #include #include #include @@ -41,12 +40,17 @@ namespace tensorOp { bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); + const auto& aTensorDesc = problem.GetATensorDesc(); + // const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& alens = aTensorDesc.GetLengths(); + // const auto& blens = bTensorDesc.GetLengths(); auto asize = alens.size(); + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + if(asize == 2) { return true; @@ -69,11 +73,9 @@ Op2dTensorGeneric::GetSolution(const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto cTensorDesc = problem.GetCTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); - auto clens = cTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); size_t local_threads = 32; size_t max_num_wg = 4096; @@ -82,61 +84,42 @@ Op2dTensorGeneric::GetSolution(const ExecutionContext& context, num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; size_t global_threads = num_wg * local_threads; - const std::vector vld{local_threads, 1, 1}; - const std::vector vgd{global_threads, 1, 1}; + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; - KernelBuildParameters build_params = - KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}}; + KernelBuildParameters build_params = KernelBuildParameters{}; - switch(problem.GetTensorOp()) - { - case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; - case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break; - case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break; - case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break; - } - - // support for 64bit still not merged - // if(aTensorDesc.AllDimsFitIntoInt()) - // { - // build_params.Define("DIM_TYPE", "uint32_t"); - // } - // else - // { - // build_params.Define("DIM_TYPE", "uint64_t"); - // } + GetCommonParams(build_params, problem, false); build_params.Define("USE_2D_TENSOR_GENERIC"); auto kernel = KernelInfo{}; - kernel.comp_options = build_params.GenerateFor( - kbp::HIP{}); // GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{}); + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; kernel.kernel_name = "Op2dTensorGeneric"; - for(uint32_t i = 0; i <= 2; i++) - { - kernel.l_wk.push_back(vld[i]); - kernel.g_wk.push_back(vgd[i]); - } + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [=](const std::vector kernels) { + result.invoker_factory = [](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(bTensorDesc.GetType(), [&](auto as_float) { + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - auto blens = params.bTensorDesc.GetLengths(); - auto clens = params.cTensorDesc.GetLengths(); + const auto& blens = params.bTensorDesc.GetLengths(); + const auto& clens = params.cTensorDesc.GetLengths(); - auto astrides = params.aTensorDesc.GetStrides(); - auto bstrides = params.bTensorDesc.GetStrides(); - auto cstrides = params.cTensorDesc.GetStrides(); + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); kernel(params.ATensor, params.BTensor, diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index 9c53a3e99e..8782fe1b29 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -23,9 +23,9 @@ * SOFTWARE. 
* *******************************************************************************/ +#include "tensor_op_helpers.hpp" #include - #include #include #include @@ -41,16 +41,21 @@ namespace tensorOp { bool Op2dTensorLite::IsApplicable(const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto cTensorDesc = problem.GetCTensorDesc(); + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); auto asize = alens.size(); + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + if(asize < 3) { return false; @@ -90,17 +95,11 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto cTensorDesc = problem.GetCTensorDesc(); - - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); - auto astrides = aTensorDesc.GetStrides(); - auto bstrides = bTensorDesc.GetStrides(); - auto cstrides = cTensorDesc.GetStrides(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); // first_not_one is incorrect if btensor size equal to 1 auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); @@ -124,9 +123,9 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, size_t local_threads = 256; // for naive tensor ops - size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1; - const std::string data_type = GetDataType(bTensorDesc.GetType()); - const std::string READ_TYPE = (RD_BLCK == 1) ? 
data_type : data_type + std::to_string(RD_BLCK); + size_t RD_BLCK = size_t(1); + std::string READ_TYPE = ""; + GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType(), RD_BLCK, READ_TYPE); size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); size_t grp_sz = (total_work + local_threads - 1) / local_threads; @@ -140,19 +139,12 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, grp_sz2 = std::min(size_t(max_num_wg / grp_sz), grp_sz2); size_t glb_sz2 = local_threads2 * grp_sz2; - const std::vector vld{local_threads, 1, 1}; - const std::vector vgd{glb_sz, glb_sz2, 1}; + const std::array vld{local_threads, 1, 1}; + const std::array vgd{glb_sz, glb_sz2, 1}; - KernelBuildParameters build_params = - KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}}; + KernelBuildParameters build_params = KernelBuildParameters{}; - switch(problem.GetTensorOp()) - { - case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; - case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break; - case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break; - case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break; - } + GetCommonParams(build_params, problem, false); build_params.Define("USE_2D_TENSOR_LITE"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); @@ -160,26 +152,31 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, auto kernel = KernelInfo{}; - kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); kernel.kernel_file = "MIOpenTensorKernels.cl"; kernel.kernel_name = "Op2dTensorLite"; - for(uint32_t i = 0; i <= 2; i++) - { - kernel.l_wk.push_back(vld[i]); - kernel.g_wk.push_back(vgd[i]); - } + using std::begin, std::end; - result.invoker_factory = [=](const std::vector kernels) { + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = [total_work, total_work2](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(bTensorDesc.GetType(), [&](auto as_float) { + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); + const auto& blens = params.bTensorDesc.GetLengths(); + + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); + kernel(params.ATensor, static_cast(astrides[1]), params.BTensor, @@ -192,6 +189,8 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, static_cast(params.Aoffset), static_cast(params.Boffset), static_cast(params.Coffset), + static_cast(total_work), + static_cast(total_work2), static_cast(!float_equal(miopen_beta, 0.0)), static_cast(blens[1] == 1)); }); diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index 687123ed0a..0368592cf7 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -23,9 +23,8 @@ * SOFTWARE. 
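The vector-width helper introduced here is easy to sanity-check with a worked example (illustrative numbers, not patch code): for clens[2] == 8 on a float tensor, GetRDBLCKandREADTYPE picks RD_BLCK = 4 and READ_TYPE = "float4", so each work-item moves a float4 and total_work above becomes std::max(8 / 4, size_t(1)) == 2; an odd innermost length such as 7 degrades to RD_BLCK = 1 and plain "float".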
* *******************************************************************************/ - +#include "tensor_op_helpers.hpp" #include - #include #include #include @@ -41,16 +40,21 @@ namespace tensorOp { bool Op2dTensorSquash::IsApplicable(const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto cTensorDesc = problem.GetCTensorDesc(); + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); auto asize = alens.size(); + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + if(asize < 3) { return false; @@ -83,17 +87,11 @@ Op2dTensorSquash::GetSolution(const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto cTensorDesc = problem.GetCTensorDesc(); - - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); - auto astrides = aTensorDesc.GetStrides(); - auto bstrides = bTensorDesc.GetStrides(); - auto cstrides = cTensorDesc.GetStrides(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); // first_not_one is incorrect if btensor size equal to 1 auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); @@ -117,9 +115,9 @@ Op2dTensorSquash::GetSolution(const ExecutionContext& context, size_t local_threads = 256; // for naive tensor ops - size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1; - const std::string data_type = GetDataType(bTensorDesc.GetType()); - const std::string READ_TYPE = (RD_BLCK == 1) ? 
data_type : data_type + std::to_string(RD_BLCK); + size_t RD_BLCK = size_t(1); + std::string READ_TYPE = ""; + GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType(), RD_BLCK, READ_TYPE); size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); size_t grp_sz = (total_work + local_threads - 1) / local_threads; @@ -127,19 +125,12 @@ Op2dTensorSquash::GetSolution(const ExecutionContext& context, grp_sz = std::min(size_t(max_num_wg), grp_sz); size_t glb_sz = local_threads * grp_sz; - const std::vector vld{local_threads, 1, 1}; - const std::vector vgd{glb_sz, 1, 1}; + const std::array vld{local_threads, 1, 1}; + const std::array vgd{glb_sz, 1, 1}; - KernelBuildParameters build_params = - KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}}; + KernelBuildParameters build_params = KernelBuildParameters{}; - switch(problem.GetTensorOp()) - { - case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; - case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break; - case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break; - case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break; - } + GetCommonParams(build_params, problem, false); build_params.Define("USE_2D_TENSOR_SQUASH"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); @@ -147,26 +138,28 @@ Op2dTensorSquash::GetSolution(const ExecutionContext& context, auto kernel = KernelInfo{}; - kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); kernel.kernel_file = "MIOpenTensorKernels.cl"; kernel.kernel_name = "Op2dTensorSquash"; - for(uint32_t i = 0; i <= 2; i++) - { - kernel.l_wk.push_back(vld[i]); - kernel.g_wk.push_back(vgd[i]); - } + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [=](const std::vector kernels) { + result.invoker_factory = [total_work](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(bTensorDesc.GetType(), [&](auto as_float) { + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); + const auto& blens = params.bTensorDesc.GetLengths(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + kernel(params.ATensor, params.BTensor, static_cast(blens[1]), diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp new file mode 100644 index 0000000000..e2734ddd05 --- /dev/null +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -0,0 +1,186 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool Op3dTensorGeneric::IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& alens = aTensorDesc.GetLengths(); + auto asize = alens.size(); + + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + + if(asize == 3) + { + return true; + } + + return false; +} + +std::size_t +Op3dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +Op3dTensorGeneric::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + // first_not_one is incorrect if btensor size equal to 1 + auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); + auto d = std::distance(blens.begin(), first_not_one.base()); + + // quick fix + int num_wg = first_not_one != blens.rend() + ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) + : 1; + + int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + + unsigned int bitmap = 0; + // update bitmap for first_not_one + bitmap |= (1 << (blens.size() - d)); + + for(int i = (d - 2); i >= 0; i--) + { + if(blens[i] != 1) + { + bitmap |= (1 << (blens.size() - (i + 1))); + num_wg *= blens[i]; + } + else + { + work_per_wg *= clens[i]; + } + } + + int num_wg_orig = num_wg; + int max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? 
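+    /* A worked example of the bitmap/work split above (illustrative values,
+     * not patch code): for blens = {1, 16, 1} against clens = {20, 16, 8},
+     * first_not_one stops at blens[1] == 16, so d == 2 and num_wg starts at 16;
+     * bit (blens.size() - d) == 1 is set, giving bitmap == 0b010 (dim 1 present
+     * in 'b'); blens[0] == 1, so work_per_wg grows from clens[2] == 8 to
+     * 8 * clens[0] == 160. Each of the 16 workgroups then walks 160 'c'
+     * elements, and the clamp below caps num_wg at max_num_wg. */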
max_num_wg : num_wg; + + size_t local_threads = 256; + size_t global_threads = num_wg * local_threads; + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("USE_3D_TENSOR_GENERIC"); + build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.kernel_name = "Op3dTensorGeneric"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = [bitmap, work_per_wg, num_wg_orig](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + const auto& blens = params.bTensorDesc.GetLengths(); + const auto& clens = params.cTensorDesc.GetLengths(); + + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); + + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + params.BTensor, + static_cast(blens[1]), + static_cast(blens[2]), + static_cast(bstrides[0]), + static_cast(bstrides[1]), + params.CTensor, + static_cast(clens[1]), + static_cast(clens[2]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + bitmap, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig)); + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp new file mode 100644 index 0000000000..a9446472bf --- /dev/null +++ b/src/solver/tensorOp/tensor_op_helpers.hpp @@ -0,0 +1,77 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include
+#include
+#include
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensorOp {
+
+inline void GetCommonParams(KernelBuildParameters& build_params,
+                            miopen::tensorOp::ProblemDescription problem,
+                            bool is64bSupported)
+{
+    build_params.Define("MIOPEN_TYPE", miopen::GetDataType(problem.GetBTensorDesc().GetType()));
+
+    switch(problem.GetTensorOp())
+    {
+    case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break;
+    case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break;
+    case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break;
+    case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break;
+    }
+
+    if(is64bSupported && problem.GetATensorDesc().AllDimsFitIntoInt())
+    {
+        build_params.Define("DIM_TYPE", "uint32_t");
+    }
+    else
+    {
+        build_params.Define("DIM_TYPE", "uint64_t");
+    }
+    // current workaround
+    build_params.Define("MIOPEN_USE_FP16", std::to_string(0));
+    build_params.Define("MIOPEN_USE_FP32", std::to_string(1));
+}
+
+inline void
+GetRDBLCKandREADTYPE(size_t len, miopenDataType_t type, size_t& RD_BLCK, std::string& READ_TYPE)
+{
+    RD_BLCK                     = (len % 4 == 0) ? 4 : (len % 2 == 0) ? 2 : 1;
+    const std::string data_type = GetDataType(type);
+    READ_TYPE = (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK);
+}
+
+} // namespace tensorOp
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/tensor.cpp b/src/tensor.cpp
index 6c258c0e7d..20973d58be 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -925,7 +925,8 @@ void OpTensor2(Handle& handle,
     const auto solvers = solver::SolverContainer{} +
                          solver::SolverContainer{} +
                          solver::SolverContainer{} +
-                         solver::SolverContainer{};
+                         solver::SolverContainer{} +
+                         solver::SolverContainer{};
 
     solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
 }
diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp
index c93d1b1eb1..5d33229cef 100644
--- a/test/tensor_ops.cpp
+++ b/test/tensor_ops.cpp
@@ -241,7 +241,7 @@ struct tensor_ops_driver : test_driver
 
     std::vector> get_sub_tensor_a()
     {
-        return {/*{32, 16, 8, 4, 4}, {16, 20, 16, 8}, {20, 16, 8}, {1, 16, 8},*/ {16, 8}, {8}};
+        return {/*{32, 16, 8, 4, 4}, {16, 20, 16, 8}, */ {20, 16, 8}, {1, 16, 8}, {16, 8}, {8}};
     }
 
     std::vector> get_sub_tensor_b()
@@ -258,12 +258,12 @@ struct tensor_ops_driver : test_driver
         {1, 20, 16, 1},
         {1, 20, 1, 1},
         {1, 1, 16, 8},
-        {1, 1, 1, 8},
+        {1, 1, 1, 8}, */
         {20, 16, 8},
         {20, 16, 1},
         {1, 16, 8},
         {1, 16, 1},
-        {20, 1, 1},*/
+        {20, 1, 1},
         {16, 8},
         {16, 1},
         {1, 8},

From cadb2649c1f29f8d916b365213421a3733c6f543 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Tue, 5 Nov 2024 19:10:16 +0200
Subject: [PATCH 07/25] initial switch to solver structure for all kernels,
 still need to separate some of them into unique solvers and tidy the code

---
 src/CMakeLists.txt                        |   5 +
 src/include/miopen/tensorOp/solvers.hpp   |  85 +++++++
 src/kernels/MIOpenTensorKernels.cl        |  18 --
 src/solver/tensorOp/Op2dTensorLite.cpp    |  18 +-
 src/solver/tensorOp/Op2dTensorSquash.cpp  |  18 +-
 src/solver/tensorOp/Op3dTensorGeneric.cpp |  28 +--
 src/solver/tensorOp/Op4dTensorGeneric.cpp | 161 ++++++++++++++
 src/solver/tensorOp/Op4dTensorLite.cpp    | 175
+++++++++++++++ src/solver/tensorOp/Op5dTensorGeneric.cpp | 174 +++++++++++++++ src/solver/tensorOp/OpTensorFwdBias.cpp | 214 ++++++++++++++++++ src/solver/tensorOp/OpTensorLeadingOnes.cpp | 232 ++++++++++++++++++++ src/solver/tensorOp/tensor_op_helpers.hpp | 146 +++++++++++- src/tensor.cpp | 11 +- test/tensor_ops.cpp | 6 +- 14 files changed, 1211 insertions(+), 80 deletions(-) create mode 100644 src/solver/tensorOp/Op4dTensorGeneric.cpp create mode 100644 src/solver/tensorOp/Op4dTensorLite.cpp create mode 100644 src/solver/tensorOp/Op5dTensorGeneric.cpp create mode 100644 src/solver/tensorOp/OpTensorFwdBias.cpp create mode 100644 src/solver/tensorOp/OpTensorLeadingOnes.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 85912d3b6f..4f1096001a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -342,6 +342,11 @@ set( MIOpen_Source solver/tensorOp/Op2dTensorLite.cpp solver/tensorOp/Op2dTensorSquash.cpp solver/tensorOp/Op3dTensorGeneric.cpp + solver/tensorOp/OpTensorFwdBias.cpp + solver/tensorOp/Op4dTensorLite.cpp + solver/tensorOp/OpTensorLeadingOnes.cpp + solver/tensorOp/Op4dTensorGeneric.cpp + solver/tensorOp/Op5dTensorGeneric.cpp subbuffers.cpp t5layernorm_api.cpp target_properties.cpp diff --git a/src/include/miopen/tensorOp/solvers.hpp b/src/include/miopen/tensorOp/solvers.hpp index 9eb9e187b6..635d0ab777 100644 --- a/src/include/miopen/tensorOp/solvers.hpp +++ b/src/include/miopen/tensorOp/solvers.hpp @@ -124,6 +124,91 @@ struct Op3dTensorGeneric final : TensorOpSolver bool MayNeedWorkspace() const override { return false; } }; +struct OpTensorFwdBias final : TensorOpSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } +}; + +struct Op4dTensorLite final : TensorOpSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } +}; + +struct OpTensorLeadingOnes final : TensorOpSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } +}; + +struct Op4dTensorGeneric final : TensorOpSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const 
miopen::tensorOp::ProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } +}; + +struct Op5dTensorGeneric final : TensorOpSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } +}; + } // namespace tensorOp } // namespace solver diff --git a/src/kernels/MIOpenTensorKernels.cl b/src/kernels/MIOpenTensorKernels.cl index da998696ac..3c21267e57 100644 --- a/src/kernels/MIOpenTensorKernels.cl +++ b/src/kernels/MIOpenTensorKernels.cl @@ -24,24 +24,6 @@ * *******************************************************************************/ -#if MIOPEN_USE_FP16 == 1 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#define _FLOAT half -#ifndef HALF_MAX -#define MAX_VAL 65504 /* max value */ -#else -#define MAX_VAL HALF_MAX -#endif -#endif -#if MIOPEN_USE_FP32 == 1 -#define _FLOAT float -#ifndef FLT_MAX -#define MAX_VAL 3.402823466e+38F /* max value */ -#else -#define MAX_VAL FLT_MAX -#endif -#endif - /* Only works for NCHW * bitmap tracks which dims are the same between 'a' and 'c'. * Example: 0, 1, 1, 0 means that C and H dims are the same and the rest are ones diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index 8782fe1b29..6dedb553e5 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -101,22 +101,12 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); - // first_not_one is incorrect if btensor size equal to 1 - auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); - auto d = std::distance(blens.begin(), first_not_one.base()); + int num_wg = 0; + int work_per_wg = 0; + unsigned int bitmap = 0; - // quick fix - int num_wg = first_not_one != blens.rend() - ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) - : 1; + GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); - for(int i = (d - 2); i >= 0; i--) - { - if(blens[i] != 1) - { - num_wg *= blens[i]; - } - } int max_num_wg = 4096; num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index 0368592cf7..2cb95d1088 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -93,22 +93,12 @@ Op2dTensorSquash::GetSolution(const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); - // first_not_one is incorrect if btensor size equal to 1 - auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); - auto d = std::distance(blens.begin(), first_not_one.base()); + int num_wg = 0; + int work_per_wg = 0; + unsigned int bitmap = 0; - // quick fix - int num_wg = first_not_one != blens.rend() - ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) - : 1; + GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); - for(int i = (d - 2); i >= 0; i--) - { - if(blens[i] != 1) - { - num_wg *= blens[i]; - } - } int max_num_wg = 4096; num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index e2734ddd05..252c8f937b 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -76,33 +76,11 @@ Op3dTensorGeneric::GetSolution(const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); - // first_not_one is incorrect if btensor size equal to 1 - auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); - auto d = std::distance(blens.begin(), first_not_one.base()); - - // quick fix - int num_wg = first_not_one != blens.rend() - ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) - : 1; - - int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); - + int num_wg = 0; + int work_per_wg = 0; unsigned int bitmap = 0; - // update bitmap for first_not_one - bitmap |= (1 << (blens.size() - d)); - for(int i = (d - 2); i >= 0; i--) - { - if(blens[i] != 1) - { - bitmap |= (1 << (blens.size() - (i + 1))); - num_wg *= blens[i]; - } - else - { - work_per_wg *= clens[i]; - } - } + GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); int num_wg_orig = num_wg; int max_num_wg = 4096; diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp new file mode 100644 index 0000000000..fb50c37a69 --- /dev/null +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -0,0 +1,161 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool Op4dTensorGeneric::IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& alens = aTensorDesc.GetLengths(); + auto asize = alens.size(); + + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + + if(asize == 4) + { + return true; + } + + return false; +} + +std::size_t +Op4dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +Op4dTensorGeneric::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + int max_num_wg = 4096; + int num_wg_orig = 0; + int work_per_wg = 0; + int incr_wg = 0; + unsigned int bitmap = 0; + + size_t local_threads = 0; + size_t global_threads = 0; + + Get4dParams( + problem, false, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("USE_4D_TENSOR_GENERIC"); + build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.kernel_name = "Op4dTensorGeneric"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = [work_per_wg, num_wg_orig, bitmap](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + const auto& blens = params.bTensorDesc.GetLengths(); + const auto& clens = params.cTensorDesc.GetLengths(); + + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); + + kernel(params.ATensor, + static_cast(astrides[0]), // a_nstride, + static_cast(astrides[1]), // a_cstride, + static_cast(astrides[2]), // a_hstride, + params.BTensor, + static_cast(blens[1]), // b_c, + static_cast(blens[2]), // b_h, + static_cast(blens[3]), // b_w, 
+ static_cast(bstrides[0]), // b_nstride, + static_cast(bstrides[1]), // b_cstride, + static_cast(bstrides[2]), // b_hstride, + params.CTensor, + static_cast(clens[1]), // c_c, + static_cast(clens[2]), // c_h, + static_cast(clens[3]), // c_w, + static_cast(cstrides[0]), // c_nstride, + static_cast(cstrides[1]), // c_cstride, + static_cast(cstrides[2]), // c_hstride, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + bitmap, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig)); + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp new file mode 100644 index 0000000000..d0431cb1f9 --- /dev/null +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -0,0 +1,175 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool Op4dTensorLite::IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + auto asize = alens.size(); + + int num_wg = 0; + int work_per_wg = 0; + unsigned int bitmap = 0; + + GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; + + auto fwd_conv_bias = bitmap == (1 << 2) ? 
1 : 0; + + bool packed_tensor = true; + + // auto alens = aTensorDesc.GetLengths(); + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + packed_tensor &= cTensorDesc.IsPacked(); + + bool packed_equal_tensor = + packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); + + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + + if(asize == 4 && fwd_conv_bias == 0 && packed_equal_tensor) + { + return true; + } + + return false; +} + +std::size_t +Op4dTensorLite::GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution Op4dTensorLite::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + int num_wg_orig = 0; + int work_per_wg = 0; + int incr_wg = 0; + unsigned int bitmap = 0; + + size_t local_threads = 0; + size_t global_threads = 0; + + Get4dParams( + problem, true, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + + size_t RD_BLCK = size_t(1); + std::string READ_TYPE = ""; + GetRDBLCKandREADTYPE(cTensorDesc.GetElementSize(), bTensorDesc.GetType(), RD_BLCK, READ_TYPE); + + size_t total_work = std::max(cTensorDesc.GetElementSize() / RD_BLCK, size_t(1)); + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("USE_4D_TENSOR_LITE"); + build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); + build_params.Define("READ_TYPE", READ_TYPE); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.kernel_name = "Op4dTensorLite"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = [total_work](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + params.BTensor, + params.CTensor, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(total_work), + static_cast(!float_equal(miopen_beta, 0.0))); + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp new file mode 100644 index 0000000000..1756111326 --- /dev/null +++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp @@ -0,0 +1,174 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool Op5dTensorGeneric::IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& alens = aTensorDesc.GetLengths(); + auto asize = alens.size(); + + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + + if(asize == 5) + { + return true; + } + return false; +} + +std::size_t +Op5dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +Op5dTensorGeneric::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& cTensorDesc = problem.GetCTensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + int num_wg = 0; + int work_per_wg = 0; + unsigned int bitmap = 0; + + GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + + int num_wg_orig = num_wg; + int max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; + + size_t local_threads = 256; + size_t global_threads = num_wg * local_threads; + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("USE_5D_TENSOR_GENERIC"); + build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.kernel_name = "Op5dTensorGeneric"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = [bitmap, work_per_wg, num_wg_orig](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + const auto& blens = params.bTensorDesc.GetLengths(); + const auto& clens = params.cTensorDesc.GetLengths(); + + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); + + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(astrides[2]), + static_cast(astrides[3]), + params.BTensor, + static_cast(blens[1]), // b_c, + static_cast(blens[2]), // b_d, + static_cast(blens[3]), // b_h, + static_cast(blens[4]), // b_w, + static_cast(bstrides[0]), // b_nstride, + static_cast(bstrides[1]), // b_cstride, + static_cast(bstrides[2]), // b_dstride, + static_cast(bstrides[3]), // b_hstride, + params.CTensor, + static_cast(clens[1]), // c_c, + static_cast(clens[2]), // c_d, + static_cast(clens[3]), // c_h, + static_cast(clens[4]), // c_w, + static_cast(cstrides[0]), // c_nstride, + static_cast(cstrides[1]), // c_cstride, + static_cast(cstrides[2]), // c_dstride, + static_cast(cstrides[3]), // c_hstride, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + bitmap, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig)); + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp new file mode 100644 index 0000000000..4f304d9fdb --- /dev/null +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -0,0 +1,214 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool OpTensorFwdBias::IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + auto asize = alens.size(); + + int num_wg = 0; + int work_per_wg = 0; + unsigned int bitmap = 0; + + GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; + + auto fwd_conv_bias = bitmap == (1 << 2) ? 
1 : 0; + + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + + if(asize == 4 && fwd_conv_bias != 0) + { + return true; + } + + return false; +} + +std::size_t +OpTensorFwdBias::GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + int max_num_wg = 4096; + int num_wg_orig = 0; + int work_per_wg = 0; + int incr_wg = 0; + unsigned int bitmap = 0; + + size_t local_threads = 0; + size_t global_threads = 0; + + Get4dParams( + problem, false, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + bool packed_tensor = true; + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + packed_tensor &= cTensorDesc.IsPacked(); + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + if(packed_tensor) + { + build_params.Define("USE_FWD_BIAS"); + kernel.kernel_name = "OpTensorFwdBias"; + } + else + { + build_params.Define("USE_FWD_BIAS_GENERIC"); + kernel.kernel_name = "OpTensorFwdBiasGeneric"; + } + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = + [work_per_wg, num_wg_orig, incr_wg, packed_tensor](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + const auto& blens = params.bTensorDesc.GetLengths(); + const auto& clens = params.cTensorDesc.GetLengths(); + + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); + + if(packed_tensor) + { + kernel(params.ATensor, + params.BTensor, + static_cast(blens[1]), + params.CTensor, + static_cast(clens[0]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + work_per_wg, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + static_cast(incr_wg)); + } + else + { + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(astrides[2]), + params.BTensor, + static_cast(blens[1]), + static_cast(bstrides[1]), + params.CTensor, + static_cast(clens[0]), + static_cast(clens[3]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + static_cast(cstrides[2]), + miopen_alpha0, + miopen_alpha1, 
+ miopen_beta, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + static_cast(incr_wg)); + } + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp new file mode 100644 index 0000000000..77b67e9a76 --- /dev/null +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -0,0 +1,232 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool OpTensorLeadingOnes::IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + auto asize = alens.size(); + + int num_wg = 0; + int work_per_wg = 0; + unsigned int bitmap = 0; + + GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; + + auto fwd_conv_bias = bitmap == (1 << 2) ? 
1 : 0; + + bool packed_tensor = true; + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + packed_tensor &= cTensorDesc.IsPacked(); + + bool packed_equal_tensor = + packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); + + bool leading_ones = true; + // first_not_one is incorrect if btensor size equal to 1 + auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); + auto d = std::distance(blens.begin(), first_not_one.base()); + + IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2), leading_ones); + + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + + if(asize == 4 && fwd_conv_bias == 0 && !packed_equal_tensor && leading_ones) + { + return true; + } + + return false; +} + +std::size_t +OpTensorLeadingOnes::GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + int max_num_wg = 4096; + int num_wg_orig = 0; + int work_per_wg = 0; + int incr_wg = 0; + unsigned int bitmap = 0; + + size_t local_threads = 0; + size_t global_threads = 0; + + Get4dParams( + problem, false, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + bool packed_tensor = true; + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + packed_tensor &= cTensorDesc.IsPacked(); + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + if(packed_tensor) + { + build_params.Define("USE_LEADING_ONES"); + kernel.kernel_name = "OpTensorLeadingOnes"; + } + else + { + build_params.Define("USE_LEADING_ONES_GENERIC"); + kernel.kernel_name = "OpTensorLeadingOnesGeneric"; + } + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = + [work_per_wg, num_wg_orig, bitmap, packed_tensor](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + const auto& clens = params.cTensorDesc.GetLengths(); + + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); + + if(packed_tensor) + { + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(clens[1]), + static_cast(clens[2]), + static_cast(clens[3]), + 
static_cast(cstrides[0]), + static_cast(cstrides[1]), + work_per_wg, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + bitmap); + } + else + { + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(astrides[2]), + params.BTensor, + static_cast(bstrides[0]), + static_cast(bstrides[1]), + static_cast(bstrides[2]), + params.CTensor, + static_cast(clens[1]), + static_cast(clens[2]), + static_cast(clens[3]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + static_cast(cstrides[2]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + bitmap); + } + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp index a9446472bf..02b1a137d0 100644 --- a/src/solver/tensorOp/tensor_op_helpers.hpp +++ b/src/solver/tensorOp/tensor_op_helpers.hpp @@ -57,9 +57,6 @@ inline void GetCommonParams(KernelBuildParameters& build_params, { build_params.Define("DIM_TYPE", "uint64_t"); } - // current workaround - build_params.Define("MIOPEN_USE_FP16", std::to_string(0)); - build_params.Define("MIOPEN_USE_FP32", std::to_string(1)); } inline void @@ -70,6 +67,149 @@ GetRDBLCKandREADTYPE(size_t len, miopenDataType_t type, size_t& RD_BLCK, std::st READ_TYPE = (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK); } +inline void GetBitmapAndWgInfo(const std::vector& blens, + const std::vector& clens, + int& num_wg, + int& work_per_wg, + unsigned int& bitmap) +{ + // first_not_one is incorrect if btensor size equal to 1 + auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); + auto d = std::distance(blens.begin(), first_not_one.base()); + + // quick fix + num_wg = first_not_one != blens.rend() + ? static_cast(*first_not_one == 0 ? 
1 : *first_not_one) + : 1; + + work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + + // update bitmap for first_not_one + bitmap |= (1 << (blens.size() - d)); + + for(int i = (d - 2); i >= 0; i--) + { + if(blens[i] != 1) + { + bitmap |= (1 << (blens.size() - (i + 1))); + num_wg *= blens[i]; + } + else + { + work_per_wg *= clens[i]; + } + } +} + +inline void +IsBitmapLeadingOnes(unsigned int bitmap, int n_size, int first_not_one, bool& leading_ones) +{ + for(int i = first_not_one; i >= 0; i--) + { + bool is_one = (bitmap & (1 << (n_size - 1 - i))) != 0u; + leading_ones &= is_one; + } +} + +inline void Get4dParams(const miopen::tensorOp::ProblemDescription& problem, + bool is4dLite, + int& num_wg_orig, + int& work_per_wg, + int& incr_wg, + unsigned int& bitmap, + size_t& local_threads, + size_t& global_threads) +{ + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + auto dims = clens.size(); + + // first_not_one is incorrect if btensor size equal to 1 + auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); + auto d = std::distance(blens.begin(), first_not_one.base()); + + // quick fix + int num_wg = first_not_one != blens.rend() + ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) + : 1; + + work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + + // update bitmap for first_not_one + bitmap |= (1 << (blens.size() - d)); + + for(int i = (d - 2); i >= 0; i--) + { + if(blens[i] != 1) + { + bitmap |= (1 << (blens.size() - (i + 1))); + num_wg *= blens[i]; + } + else + { + work_per_wg *= clens[i]; + } + } + + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; + + // Forward Convolution Bias specialization + // for fwd-bias, bitmap looks like <0, 1, 0, 0> + // Is the no. of work-groups and the work for each wg balanced? + auto fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; + // This block gives off indexing for 5d tensors, skipping + if(fwd_conv_bias == 1 && dims < 5 && num_wg < 640 && work_per_wg > 256 && clens[0] > 0) + { // 640 workgroups of size 256 needed to completely fill the GPU + + work_per_wg /= clens[0]; // c_n; + num_wg *= clens[0]; // c_n; + incr_wg = 1; + } + + num_wg_orig = num_wg; + int max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; + + local_threads = 256; + + bool leading_ones = true; + IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2), leading_ones); + + if(leading_ones && work_per_wg < 64) + { + local_threads = 64; + } + + // Special case for adding tensors in place + global_threads = + (static_cast(leading_ones) == 1 && (d - 1) == 3) ? num_wg : num_wg * local_threads; + global_threads = (global_threads < local_threads) ? local_threads : global_threads; + + if(is4dLite) + { + // for naive tensor ops + const std::string data_type = GetDataType(bTensorDesc.GetType()); + + size_t TENS_LEN = cTensorDesc.GetElementSize(); + size_t RD_BLCK = (TENS_LEN % 4 == 0) ? 4 : (TENS_LEN % 2 == 0) ? 2 : 1; + const std::string READ_TYPE = + (RD_BLCK == 1) ? 
data_type : data_type + std::to_string(RD_BLCK); + + size_t total_work = std::max(TENS_LEN / RD_BLCK, size_t(1)); + size_t grp_sz = (total_work + local_threads - 1) / local_threads; + grp_sz = std::min(size_t(max_num_wg), grp_sz); + size_t glb_sz = local_threads * grp_sz; + + global_threads = glb_sz; + } +} + } // namespace tensorOp } // namespace solver diff --git a/src/tensor.cpp b/src/tensor.cpp index 20973d58be..f65a1a408e 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -922,11 +922,16 @@ void OpTensor2(Handle& handle, nonStandardSquash}; const auto algo = AlgorithmName{"TensorOpSolver"}; - const auto solvers = solver::SolverContainer{} + + const auto solvers = solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + solver::SolverContainer{} + - solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + solver::SolverContainer{} + - solver::SolverContainer{}; + solver::SolverContainer{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); } diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp index 5d33229cef..ee42a94ea6 100644 --- a/test/tensor_ops.cpp +++ b/test/tensor_ops.cpp @@ -241,12 +241,12 @@ struct tensor_ops_driver : test_driver std::vector> get_sub_tensor_a() { - return {/*{32, 16, 8, 4, 4}, {16, 20, 16, 8}, */ {20, 16, 8}, {1, 16, 8}, {16, 8}, {8}}; + return {{32, 16, 8, 4, 4}, {16, 20, 16, 8}, {20, 16, 8}, {1, 16, 8}, {16, 8}, {8}}; } std::vector> get_sub_tensor_b() { - return {/*{32, 16, 8, 4, 4}, + return {{32, 16, 8, 4, 4}, {32, 16, 1, 1, 1}, {1, 16, 8, 1, 1}, {1, 1, 8, 4, 1}, @@ -258,7 +258,7 @@ struct tensor_ops_driver : test_driver {1, 20, 16, 1}, {1, 20, 1, 1}, {1, 1, 16, 8}, - {1, 1, 1, 8}, */ + {1, 1, 1, 8}, {20, 16, 8}, {20, 16, 1}, {1, 16, 8}, From 63603f02fbd03243b088f04d8de261cd40a61a75 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Wed, 6 Nov 2024 14:03:58 +0200 Subject: [PATCH 08/25] fix for two kernels in one solver --- src/solver/tensorOp/OpTensorFwdBias.cpp | 5 +++-- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index 4f304d9fdb..c2aa07c8ae 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -119,8 +119,6 @@ ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, auto kernel = KernelInfo{}; - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - kernel.kernel_file = "MIOpenTensorKernels.cl"; if(packed_tensor) { build_params.Define("USE_FWD_BIAS"); @@ -132,6 +130,9 @@ ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, kernel.kernel_name = "OpTensorFwdBiasGeneric"; } + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + using std::begin, std::end; kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index 77b67e9a76..8f4a4399ce 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -132,11 +132,8 @@ OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, GetCommonParams(build_params, problem, false); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); - auto kernel = KernelInfo{}; - kernel.comp_options = 
build_params.GenerateFor(kbp::OpenCL{}); - kernel.kernel_file = "MIOpenTensorKernels.cl"; if(packed_tensor) { build_params.Define("USE_LEADING_ONES"); @@ -148,6 +145,9 @@ OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, kernel.kernel_name = "OpTensorLeadingOnesGeneric"; } + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + using std::begin, std::end; kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); From 976bd84f1234ff7c1906db3b85e4f60b37346769 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Thu, 7 Nov 2024 15:29:44 +0200 Subject: [PATCH 09/25] additional changes --- src/solver/tensorOp/Op1dTensorGeneric.cpp | 17 +----- src/solver/tensorOp/Op2dTensorGeneric.cpp | 7 +-- src/solver/tensorOp/Op2dTensorLite.cpp | 44 ++++++-------- src/solver/tensorOp/Op2dTensorSquash.cpp | 33 +++++------ src/solver/tensorOp/Op3dTensorGeneric.cpp | 8 +-- src/solver/tensorOp/Op4dTensorGeneric.cpp | 13 +---- src/solver/tensorOp/Op4dTensorLite.cpp | 62 ++++++++------------ src/solver/tensorOp/Op5dTensorGeneric.cpp | 9 +-- src/solver/tensorOp/OpTensorFwdBias.cpp | 41 ++++++-------- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 63 +++++++++------------ src/solver/tensorOp/tensor_op_helpers.hpp | 60 ++++++++++---------- src/tensorOp/problem_description.cpp | 43 +++++++------- 12 files changed, 162 insertions(+), 238 deletions(-) diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index 640f9968e2..a149f488e2 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -42,12 +42,10 @@ bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); - // const auto& bTensorDesc = problem.GetBTensorDesc(); - const auto& alens = aTensorDesc.GetLengths(); - // const auto& blens = bTensorDesc.GetLengths(); + const auto& alens = aTensorDesc.GetLengths(); auto asize = alens.size(); - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } @@ -56,16 +54,7 @@ bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, { return true; } - // add support for this later - // if(asize == 2 && ((blens[0] == 1 && blens[1] == 1) || (blens[0] > 1 && blens[1] > 1))) - // { - // return true; - // } - // if(asize == 3 && ((blens[0] == 1 && blens[1] == 1 && blens[2] == 1) || - // (blens[0] > 1 && blens[1] > 1 && blens[2] > 1))) - // { - // return true; - // } + return false; } diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index 92a19d6a99..3f4b03c2d1 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -41,12 +41,10 @@ bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); - // const auto& bTensorDesc = problem.GetBTensorDesc(); - const auto& alens = aTensorDesc.GetLengths(); - // const auto& blens = bTensorDesc.GetLengths(); + const auto& alens = aTensorDesc.GetLengths(); auto asize = alens.size(); - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } @@ -55,7 +53,6 @@ bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, { return true; } - // add applicable when asize 
== 3 and some special cases for b dimensions return false; } diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index 6dedb553e5..d76c4f57f6 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -51,33 +51,31 @@ bool Op2dTensorLite::IsApplicable(const ExecutionContext& context, auto asize = alens.size(); - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } - if(asize < 3) + if(asize == 3) { - return false; - } + size_t local_threads = 256; + int max_num_wg = 4096; - size_t local_threads = 256; - int max_num_wg = 4096; + // for naive tensor ops + size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1; + size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); + size_t grp_sz = (total_work + local_threads - 1) / local_threads; - // for naive tensor ops - size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1; - size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); - size_t grp_sz = (total_work + local_threads - 1) / local_threads; - - // opencl kernels are no longer supported, fallback to generic case - bool lite_applicable = grp_sz <= size_t(max_num_wg); + // opencl kernels are no longer supported, fallback to generic case + bool lite_applicable = grp_sz <= size_t(max_num_wg); - bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 && - (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2]; + bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 && + (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2]; - if(asize == 3 && lite_applicable && is_lite) - { - return true; + if(lite_applicable && is_lite) + { + return true; + } } return false; @@ -101,11 +99,7 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); - int num_wg = 0; - int work_per_wg = 0; - unsigned int bitmap = 0; - - GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int max_num_wg = 4096; num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; @@ -113,9 +107,7 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, size_t local_threads = 256; // for naive tensor ops - size_t RD_BLCK = size_t(1); - std::string READ_TYPE = ""; - GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType(), RD_BLCK, READ_TYPE); + auto&& [RD_BLCK, READ_TYPE] = GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType()); size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); size_t grp_sz = (total_work + local_threads - 1) / local_threads; diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index 2cb95d1088..93f2868905 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -50,25 +50,24 @@ bool Op2dTensorSquash::IsApplicable(const ExecutionContext& context, auto asize = alens.size(); - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } - if(asize < 3) + if(asize == 3) { - return false; - } - - bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 && - (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2]; + bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 && + (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2]; - bool is_squashed = problem.GetNonStandardSquash() && !is_lite && - (blens[0] == 1 && clens[0] == 1 && clens[1] == 1 && blens[2] == clens[2]); + bool is_squashed = + problem.GetNonStandardSquash() && !is_lite && + (blens[0] == 1 && clens[0] == 1 && clens[1] == 1 && blens[2] == clens[2]); - if(asize == 3 && is_squashed) - { - return true; + if(is_squashed) + { + return true; + } } return false; @@ -93,11 +92,7 @@ Op2dTensorSquash::GetSolution(const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); - int num_wg = 0; - int work_per_wg = 0; - unsigned int bitmap = 0; - - GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int max_num_wg = 4096; num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; @@ -105,9 +100,7 @@ Op2dTensorSquash::GetSolution(const ExecutionContext& context, size_t local_threads = 256; // for naive tensor ops - size_t RD_BLCK = size_t(1); - std::string READ_TYPE = ""; - GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType(), RD_BLCK, READ_TYPE); + auto&& [RD_BLCK, READ_TYPE] = GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType()); size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); size_t grp_sz = (total_work + local_threads - 1) / local_threads; diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index 252c8f937b..1aeb83509d 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -44,7 +44,7 @@ bool Op3dTensorGeneric::IsApplicable(const ExecutionContext& context, const auto& alens = aTensorDesc.GetLengths(); auto asize = alens.size(); - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } @@ -76,11 +76,7 @@ Op3dTensorGeneric::GetSolution(const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); - int num_wg = 0; - int work_per_wg = 0; - unsigned int bitmap = 0; - - GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int num_wg_orig = num_wg; int max_num_wg = 4096; diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index fb50c37a69..ad17bf8791 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -44,7 +44,7 @@ bool Op4dTensorGeneric::IsApplicable(const ExecutionContext& context, const auto& alens = aTensorDesc.GetLengths(); auto asize = alens.size(); - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } @@ -71,16 +71,9 @@ Op4dTensorGeneric::GetSolution(const ExecutionContext& context, auto result = ConvSolution{miopenStatusSuccess}; int max_num_wg = 4096; - int num_wg_orig = 0; - int work_per_wg = 0; - int incr_wg = 0; - unsigned int bitmap = 0; - size_t local_threads = 0; - size_t global_threads = 0; - - Get4dParams( - problem, false, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = Get4dParams( + problem, false); const std::array vld{local_threads, 1, 1}; const std::array vgd{global_threads, 1, 1}; diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp index d0431cb1f9..588a2c6ef9 100644 --- a/src/solver/tensorOp/Op4dTensorLite.cpp +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -50,36 +50,33 @@ bool Op4dTensorLite::IsApplicable(const ExecutionContext& context, auto asize = alens.size(); - int num_wg = 0; - int work_per_wg = 0; - unsigned int bitmap = 0; - - GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); - - // quick fix for btensor = <1, 1, 1, 1> - if(bTensorDesc.GetElementSize() == 1) - bitmap = 4; + if(aTensorDesc.GetType() == miopenDouble) + { + return false; + } - auto fwd_conv_bias = bitmap == (1 << 2) ? 
1 : 0; + if(asize == 4) + { + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); - bool packed_tensor = true; + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; - // auto alens = aTensorDesc.GetLengths(); - packed_tensor &= aTensorDesc.IsPacked(); - packed_tensor &= bTensorDesc.IsPacked(); - packed_tensor &= cTensorDesc.IsPacked(); + bool fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; - bool packed_equal_tensor = - packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); + bool packed_tensor = true; + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + packed_tensor &= cTensorDesc.IsPacked(); - if(GetDataType(aTensorDesc.GetType()) == "double") - { - return false; - } + bool packed_equal_tensor = + packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); - if(asize == 4 && fwd_conv_bias == 0 && packed_equal_tensor) - { - return true; + if(fwd_conv_bias == 0 && packed_equal_tensor) + { + return true; + } } return false; @@ -100,20 +97,11 @@ ConvSolution Op4dTensorLite::GetSolution(const ExecutionContext& context, const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); - int num_wg_orig = 0; - int work_per_wg = 0; - int incr_wg = 0; - unsigned int bitmap = 0; - - size_t local_threads = 0; - size_t global_threads = 0; - - Get4dParams( - problem, true, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = + Get4dParams(problem, true); - size_t RD_BLCK = size_t(1); - std::string READ_TYPE = ""; - GetRDBLCKandREADTYPE(cTensorDesc.GetElementSize(), bTensorDesc.GetType(), RD_BLCK, READ_TYPE); + auto&& [RD_BLCK, READ_TYPE] = + GetRDBLCKandREADTYPE(cTensorDesc.GetElementSize(), bTensorDesc.GetType()); size_t total_work = std::max(cTensorDesc.GetElementSize() / RD_BLCK, size_t(1)); diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp index 1756111326..b0cb0397e0 100644 --- a/src/solver/tensorOp/Op5dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp @@ -45,7 +45,7 @@ bool Op5dTensorGeneric::IsApplicable(const ExecutionContext& context, const auto& alens = aTensorDesc.GetLengths(); auto asize = alens.size(); - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } @@ -54,6 +54,7 @@ bool Op5dTensorGeneric::IsApplicable(const ExecutionContext& context, { return true; } + return false; } @@ -76,11 +77,7 @@ Op5dTensorGeneric::GetSolution(const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); - int num_wg = 0; - int work_per_wg = 0; - unsigned int bitmap = 0; - - GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int num_wg_orig = num_wg; int max_num_wg = 4096; diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index c2aa07c8ae..05c1984941 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -50,28 +50,26 @@ bool OpTensorFwdBias::IsApplicable(const ExecutionContext& context, auto asize = alens.size(); - int num_wg = 0; - int work_per_wg = 0; - unsigned int bitmap = 0; - - GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); - - // quick fix 
for btensor = <1, 1, 1, 1> - if(bTensorDesc.GetElementSize() == 1) - bitmap = 4; - - auto fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; - - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } - if(asize == 4 && fwd_conv_bias != 0) + if(asize == 4) { - return true; - } + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; + + bool fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; + + if(fwd_conv_bias != 0) + { + return true; + } + } return false; } @@ -92,16 +90,9 @@ ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, const auto& cTensorDesc = problem.GetCTensorDesc(); int max_num_wg = 4096; - int num_wg_orig = 0; - int work_per_wg = 0; - int incr_wg = 0; - unsigned int bitmap = 0; - - size_t local_threads = 0; - size_t global_threads = 0; - Get4dParams( - problem, false, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = + Get4dParams(problem, false); const std::array vld{local_threads, 1, 1}; const std::array vgd{global_threads, 1, 1}; diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index 8f4a4399ce..3b99e0a8e5 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -50,41 +50,41 @@ bool OpTensorLeadingOnes::IsApplicable(const ExecutionContext& context, auto asize = alens.size(); - int num_wg = 0; - int work_per_wg = 0; - unsigned int bitmap = 0; + if(aTensorDesc.GetType() == miopenDouble) + { + return false; + } - GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + if(asize == 4) + { - // quick fix for btensor = <1, 1, 1, 1> - if(bTensorDesc.GetElementSize() == 1) - bitmap = 4; + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); - auto fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; - bool packed_tensor = true; - packed_tensor &= aTensorDesc.IsPacked(); - packed_tensor &= bTensorDesc.IsPacked(); - packed_tensor &= cTensorDesc.IsPacked(); + bool fwd_conv_bias = bitmap == (1 << 2) ? 
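[Context for the fwd_conv_bias test: after the <1, 1, 1, 1> quick fix, a bitmap equal to (1 << 2) has exactly one dimension bit set, the pattern the surrounding comments describe as a forward-convolution bias tensor. A standalone illustration; the bit layout is assumed from those comments, not taken from MIOpen headers.]

#include <cassert>

inline bool IsFwdConvBias(unsigned int bitmap) { return bitmap == (1u << 2); }

int main()
{
    assert(IsFwdConvBias(0b0100u));  // only one dim participates: bias case
    assert(!IsFwdConvBias(0b0101u)); // extra dims set: generic path instead
    return 0;
}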
1 : 0; - bool packed_equal_tensor = - packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); + bool packed_tensor = true; + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + packed_tensor &= cTensorDesc.IsPacked(); - bool leading_ones = true; - // first_not_one is incorrect if btensor size equal to 1 - auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); - auto d = std::distance(blens.begin(), first_not_one.base()); + bool packed_equal_tensor = + packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); - IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2), leading_ones); + // first_not_one is incorrect if btensor size equal to 1 + auto first_not_one = + std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); + auto d = std::distance(blens.begin(), first_not_one.base()); - if(GetDataType(aTensorDesc.GetType()) == "double") - { - return false; - } + bool leading_ones = IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2)); - if(asize == 4 && fwd_conv_bias == 0 && !packed_equal_tensor && leading_ones) - { - return true; + if(fwd_conv_bias == 0 && !packed_equal_tensor && leading_ones) + { + return true; + } } return false; @@ -108,16 +108,9 @@ OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, const auto& cTensorDesc = problem.GetCTensorDesc(); int max_num_wg = 4096; - int num_wg_orig = 0; - int work_per_wg = 0; - int incr_wg = 0; - unsigned int bitmap = 0; - - size_t local_threads = 0; - size_t global_threads = 0; - Get4dParams( - problem, false, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = + Get4dParams(problem, false); const std::array vld{local_threads, 1, 1}; const std::array vgd{global_threads, 1, 1}; diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp index 02b1a137d0..2162ed9208 100644 --- a/src/solver/tensorOp/tensor_op_helpers.hpp +++ b/src/solver/tensorOp/tensor_op_helpers.hpp @@ -29,6 +29,8 @@ #include #include +#include + namespace miopen { namespace solver { @@ -36,7 +38,7 @@ namespace solver { namespace tensorOp { inline void GetCommonParams(KernelBuildParameters& build_params, - miopen::tensorOp::ProblemDescription problem, + const miopen::tensorOp::ProblemDescription& problem, bool is64bSupported) { build_params.Define("MIOPEN_TYPE", miopen::GetDataType(problem.GetBTensorDesc().GetType())); @@ -59,31 +61,29 @@ inline void GetCommonParams(KernelBuildParameters& build_params, } } -inline void -GetRDBLCKandREADTYPE(size_t len, miopenDataType_t type, size_t& RD_BLCK, std::string& READ_TYPE) +inline std::tuple GetRDBLCKandREADTYPE(size_t len, miopenDataType_t type) { - RD_BLCK = (len % 4 == 0) ? 4 : (len % 2 == 0) ? 2 : 1; const std::string data_type = GetDataType(type); - READ_TYPE = (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK); + size_t RD_BLCK = (len % 4 == 0) ? 4 : (len % 2 == 0) ? 2 : 1; + return std::make_tuple(RD_BLCK, + (RD_BLCK == 1) ? 
data_type : data_type + std::to_string(RD_BLCK)); } -inline void GetBitmapAndWgInfo(const std::vector& blens, - const std::vector& clens, - int& num_wg, - int& work_per_wg, - unsigned int& bitmap) +inline std::tuple GetBitmapAndWgInfo(const std::vector& blens, + const std::vector& clens) { // first_not_one is incorrect if btensor size equal to 1 auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); auto d = std::distance(blens.begin(), first_not_one.base()); // quick fix - num_wg = first_not_one != blens.rend() - ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) - : 1; + int num_wg = first_not_one != blens.rend() + ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) + : 1; - work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + unsigned int bitmap = 0; // update bitmap for first_not_one bitmap |= (1 << (blens.size() - d)); @@ -99,26 +99,23 @@ inline void GetBitmapAndWgInfo(const std::vector& blens, work_per_wg *= clens[i]; } } + + return std::make_tuple(num_wg, work_per_wg, bitmap); } -inline void -IsBitmapLeadingOnes(unsigned int bitmap, int n_size, int first_not_one, bool& leading_ones) +inline bool IsBitmapLeadingOnes(unsigned int bitmap, int n_size, int first_not_one) { + bool leading_ones = true; for(int i = first_not_one; i >= 0; i--) { bool is_one = (bitmap & (1 << (n_size - 1 - i))) != 0u; leading_ones &= is_one; } + return leading_ones; } -inline void Get4dParams(const miopen::tensorOp::ProblemDescription& problem, - bool is4dLite, - int& num_wg_orig, - int& work_per_wg, - int& incr_wg, - unsigned int& bitmap, - size_t& local_threads, - size_t& global_threads) +inline std::tuple Get4dParams(const miopen::tensorOp::ProblemDescription& problem, + bool is4dLite) { const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); @@ -137,8 +134,9 @@ inline void Get4dParams(const miopen::tensorOp::ProblemDescription& problem, ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) : 1; - work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + unsigned int bitmap = 0; // update bitmap for first_not_one bitmap |= (1 << (blens.size() - d)); @@ -159,6 +157,7 @@ inline void Get4dParams(const miopen::tensorOp::ProblemDescription& problem, if(bTensorDesc.GetElementSize() == 1) bitmap = 4; + int incr_wg = 0; // Forward Convolution Bias specialization // for fwd-bias, bitmap looks like <0, 1, 0, 0> // Is the no. of work-groups and the work for each wg balanced? @@ -172,14 +171,13 @@ inline void Get4dParams(const miopen::tensorOp::ProblemDescription& problem, incr_wg = 1; } - num_wg_orig = num_wg; + int num_wg_orig = num_wg; int max_num_wg = 4096; num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; - local_threads = 256; + size_t local_threads = 256; - bool leading_ones = true; - IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2), leading_ones); + bool leading_ones = IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2)); if(leading_ones && work_per_wg < 64) { @@ -187,7 +185,7 @@ inline void Get4dParams(const miopen::tensorOp::ProblemDescription& problem, } // Special case for adding tensors in place - global_threads = + size_t global_threads = (static_cast(leading_ones) == 1 && (d - 1) == 3) ? 
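[A standalone rendering of the bool-returning IsBitmapLeadingOnes above, plus a small self-check; the sample bitmaps are illustrative.]

#include <cassert>

inline bool IsBitmapLeadingOnes(unsigned int bitmap, int n_size, int first_not_one)
{
    bool leading_ones = true;
    for(int i = first_not_one; i >= 0; i--)
        leading_ones &= (bitmap & (1u << (n_size - 1 - i))) != 0u; // test dim i from the left
    return leading_ones;
}

int main()
{
    assert(IsBitmapLeadingOnes(0b1100u, 4, 1));  // the two leading dims are set
    assert(!IsBitmapLeadingOnes(0b1010u, 4, 1)); // second-from-left dim is unset
    return 0;
}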
num_wg : num_wg * local_threads; global_threads = (global_threads < local_threads) ? local_threads : global_threads; @@ -208,6 +206,8 @@ inline void Get4dParams(const miopen::tensorOp::ProblemDescription& problem, global_threads = glb_sz; } + + return std::make_tuple(num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); } } // namespace tensorOp diff --git a/src/tensorOp/problem_description.cpp b/src/tensorOp/problem_description.cpp index dc16276f05..4056fd3172 100644 --- a/src/tensorOp/problem_description.cpp +++ b/src/tensorOp/problem_description.cpp @@ -26,6 +26,7 @@ #include #include +#include namespace miopen { @@ -42,33 +43,27 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const auto bstrides = bTensorDesc.GetStrides(); auto cstrides = cTensorDesc.GetStrides(); - std::string alens_str{}; - std::string blens_str{}; - std::string astrides_str{}; - std::string bstrides_str{}; - std::string cstrides_str{}; - - for(uint32_t i = 0; i < alens.size(); i++) - { - alens_str += std::to_string(alens[i]); - blens_str += std::to_string(blens[i]); - astrides_str += std::to_string(astrides[i]); - bstrides_str += std::to_string(bstrides[i]); - cstrides_str += std::to_string(cstrides[i]); - - if(i != (alens.size() - 1)) + auto printDims = [&ss](const auto& dim) { + for(uint32_t i = 0; i < dim.size(); i++) { - alens_str += "x"; - blens_str += "x"; - astrides_str += "x"; - bstrides_str += "x"; - cstrides_str += "x"; + ss << dim[i]; + if(i != (dim.size() - 1)) + { + ss << "x"; + } } - } + ss << "-"; + }; + + ss << std::to_string(aTensorDesc.GetType()) << "-" << std::to_string(tensorOp) << "-"; + + printDims(alens); + printDims(blens); + printDims(astrides); + printDims(bstrides); + printDims(cstrides); - ss << std::to_string(aTensorDesc.GetType()) << "-" << std::to_string(tensorOp) << "-" - << alens_str << "-" << blens_str << "-" << astrides_str << "-" << bstrides_str << "-" - << cstrides_str << "-" << std::to_string((beta == 0)); + ss << (float_equal(beta, 0.0f) ? 
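[The MakeNetworkConfig rewrite above folds five parallel strings into a single stream written by one reusable lambda. A minimal sketch of the same key-building scheme; MakeKey is an illustrative name, not MIOpen API.]

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::string MakeKey(const std::vector<std::size_t>& lens,
                    const std::vector<std::size_t>& strides)
{
    std::ostringstream ss;
    auto printDims = [&ss](const auto& dims) {
        for(std::size_t i = 0; i < dims.size(); i++)
        {
            ss << dims[i];
            if(i != dims.size() - 1)
                ss << "x";
        }
        ss << "-";
    };
    printDims(lens);
    printDims(strides);
    return ss.str();
}

int main() { std::cout << MakeKey({2, 3, 4}, {12, 4, 1}) << '\n'; } // prints 2x3x4-12x4x1-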
"1" : "0"); return NetworkConfig{ss.str()}; } From 6be98d056a5531313afc661174f643d9d908bc97 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Thu, 7 Nov 2024 16:10:46 +0200 Subject: [PATCH 10/25] clang format --- src/include/miopen/tensorOp/problem_description.hpp | 2 ++ src/solver/tensorOp/Op1dTensorGeneric.cpp | 6 +++--- src/solver/tensorOp/Op2dTensorGeneric.cpp | 6 +++--- src/solver/tensorOp/Op4dTensorGeneric.cpp | 6 +++--- src/solver/tensorOp/OpTensorFwdBias.cpp | 6 +++--- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 6 +++--- src/solver/tensorOp/tensor_op_helpers.hpp | 13 +++++++------ test/tensor_ops.cpp | 2 +- 8 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/include/miopen/tensorOp/problem_description.hpp b/src/include/miopen/tensorOp/problem_description.hpp index 81621cfcbe..515955f5be 100644 --- a/src/include/miopen/tensorOp/problem_description.hpp +++ b/src/include/miopen/tensorOp/problem_description.hpp @@ -53,6 +53,7 @@ struct ProblemDescription : ProblemDescriptionBase { MIOPEN_THROW(miopenStatusBadParm, "Beta value is nullptr"); } + beta = *(static_cast(beta_)); if(aTensorDesc.GetElementSize() != cTensorDesc.GetElementSize()) @@ -67,6 +68,7 @@ struct ProblemDescription : ProblemDescriptionBase auto blens = bTensorDesc.GetLengths(); auto clens = cTensorDesc.GetLengths(); + if(clens.size() > 5) { MIOPEN_THROW("Tensor dimension larger than 5: " + std::to_string(clens.size())); diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index a149f488e2..581b22f68e 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -43,7 +43,7 @@ bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, { const auto& aTensorDesc = problem.GetATensorDesc(); const auto& alens = aTensorDesc.GetLengths(); - auto asize = alens.size(); + auto asize = alens.size(); if(aTensorDesc.GetType() == miopenDouble) { @@ -94,8 +94,8 @@ Op1dTensorGeneric::GetSolution(const ExecutionContext& context, auto kernel = KernelInfo{}; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); - kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; - kernel.kernel_name = "Op1dTensorGeneric"; + kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; + kernel.kernel_name = "Op1dTensorGeneric"; using std::begin, std::end; diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index 3f4b03c2d1..23b7210094 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -42,7 +42,7 @@ bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, { const auto& aTensorDesc = problem.GetATensorDesc(); const auto& alens = aTensorDesc.GetLengths(); - auto asize = alens.size(); + auto asize = alens.size(); if(aTensorDesc.GetType() == miopenDouble) { @@ -93,8 +93,8 @@ Op2dTensorGeneric::GetSolution(const ExecutionContext& context, auto kernel = KernelInfo{}; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); - kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; - kernel.kernel_name = "Op2dTensorGeneric"; + kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; + kernel.kernel_name = "Op2dTensorGeneric"; using std::begin, std::end; diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index ad17bf8791..0733981896 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -70,10 +70,10 @@ Op4dTensorGeneric::GetSolution(const 
ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; - int max_num_wg = 4096; + int max_num_wg = 4096; - auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = Get4dParams( - problem, false); + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = + Get4dParams(problem, false); const std::array vld{local_threads, 1, 1}; const std::array vgd{global_threads, 1, 1}; diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index 05c1984941..cec181fcf2 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -89,7 +89,7 @@ ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); - int max_num_wg = 4096; + int max_num_wg = 4096; auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = Get4dParams(problem, false); @@ -148,7 +148,7 @@ ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, const auto& cstrides = params.cTensorDesc.GetStrides(); if(packed_tensor) - { + { // OpTensorFwdBias kernel(params.ATensor, params.BTensor, static_cast(blens[1]), @@ -167,7 +167,7 @@ ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, static_cast(incr_wg)); } else - { + { // OpTensorFwdBiasGeneric kernel(params.ATensor, static_cast(astrides[0]), static_cast(astrides[1]), diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index 3b99e0a8e5..ad4e0f3116 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -107,7 +107,7 @@ OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); - int max_num_wg = 4096; + int max_num_wg = 4096; auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = Get4dParams(problem, false); @@ -164,7 +164,7 @@ OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, const auto& cstrides = params.cTensorDesc.GetStrides(); if(packed_tensor) - { + { // OpTensorLeadingOnes kernel(params.ATensor, params.BTensor, params.CTensor, @@ -184,7 +184,7 @@ OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, bitmap); } else - { + { // OpTensorLeadingOnesGeneric kernel(params.ATensor, static_cast(astrides[0]), static_cast(astrides[1]), diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp index 2162ed9208..46ce39e4a0 100644 --- a/src/solver/tensorOp/tensor_op_helpers.hpp +++ b/src/solver/tensorOp/tensor_op_helpers.hpp @@ -114,8 +114,8 @@ inline bool IsBitmapLeadingOnes(unsigned int bitmap, int n_size, int first_not_o return leading_ones; } -inline std::tuple Get4dParams(const miopen::tensorOp::ProblemDescription& problem, - bool is4dLite) +inline std::tuple +Get4dParams(const miopen::tensorOp::ProblemDescription& problem, bool is4dLite) { const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); @@ -171,9 +171,9 @@ inline std::tuple Get4dParams(const incr_wg = 1; } - int num_wg_orig = num_wg; - int max_num_wg = 4096; - num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; + int num_wg_orig = num_wg; + int max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; size_t local_threads = 256; @@ -207,7 +207,8 @@ inline std::tuple Get4dParams(const global_threads = glb_sz; } - return std::make_tuple(num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + return std::make_tuple( + num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); } } // namespace tensorOp diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp index ee42a94ea6..1df83044b2 100644 --- a/test/tensor_ops.cpp +++ b/test/tensor_ops.cpp @@ -258,7 +258,7 @@ struct tensor_ops_driver : test_driver {1, 20, 16, 1}, {1, 20, 1, 1}, {1, 1, 16, 8}, - {1, 1, 1, 8}, + {1, 1, 1, 8}, {20, 16, 8}, {20, 16, 1}, {1, 16, 8}, From d6ffea5c372c5ae536a33040a80a75e46b118739 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Thu, 7 Nov 2024 17:00:48 +0200 Subject: [PATCH 11/25] fwd_conv_bias changed --- src/solver/tensorOp/Op4dTensorLite.cpp | 4 ++-- src/solver/tensorOp/OpTensorFwdBias.cpp | 4 ++-- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp index 588a2c6ef9..6751cfc9fc 100644 --- a/src/solver/tensorOp/Op4dTensorLite.cpp +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -63,7 +63,7 @@ bool Op4dTensorLite::IsApplicable(const ExecutionContext& context, if(bTensorDesc.GetElementSize() == 1) bitmap = 4; - bool fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; + bool fwd_conv_bias = (bitmap == (1 << 2)); bool packed_tensor = true; packed_tensor &= aTensorDesc.IsPacked(); @@ -73,7 +73,7 @@ bool Op4dTensorLite::IsApplicable(const ExecutionContext& context, bool packed_equal_tensor = packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); - if(fwd_conv_bias == 0 && packed_equal_tensor) + if(!fwd_conv_bias && packed_equal_tensor) { return true; } diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index cec181fcf2..8586b50034 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -63,9 +63,9 @@ bool OpTensorFwdBias::IsApplicable(const ExecutionContext& context, if(bTensorDesc.GetElementSize() == 1) bitmap = 4; - bool fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; + bool fwd_conv_bias = (bitmap == (1 << 2)); - if(fwd_conv_bias != 0) + if(fwd_conv_bias) { return true; } diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index ad4e0f3116..d8d09461ac 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -64,7 +64,7 @@ bool OpTensorLeadingOnes::IsApplicable(const ExecutionContext& context, if(bTensorDesc.GetElementSize() == 1) bitmap = 4; - bool fwd_conv_bias = bitmap == (1 << 2) ? 
1 : 0; + bool fwd_conv_bias = (bitmap == (1 << 2)); bool packed_tensor = true; packed_tensor &= aTensorDesc.IsPacked(); @@ -81,7 +81,7 @@ bool OpTensorLeadingOnes::IsApplicable(const ExecutionContext& context, bool leading_ones = IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2)); - if(fwd_conv_bias == 0 && !packed_equal_tensor && leading_ones) + if(!fwd_conv_bias && !packed_equal_tensor && leading_ones) { return true; } From 89dd24c8f5dd596ffec1b4c203b86210cf9048ba Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Fri, 8 Nov 2024 09:50:57 +0200 Subject: [PATCH 12/25] tidy some part of the code --- src/include/miopen/tensorOp/problem_description.hpp | 4 ++-- src/solver/tensorOp/Op1dTensorGeneric.cpp | 12 ++++++------ src/solver/tensorOp/Op2dTensorGeneric.cpp | 12 ++++++------ src/solver/tensorOp/Op2dTensorLite.cpp | 10 +++++----- src/solver/tensorOp/Op2dTensorSquash.cpp | 12 ++++++------ src/solver/tensorOp/Op3dTensorGeneric.cpp | 12 ++++++------ src/solver/tensorOp/Op4dTensorGeneric.cpp | 12 ++++++------ src/solver/tensorOp/Op4dTensorLite.cpp | 10 +++++----- src/solver/tensorOp/Op5dTensorGeneric.cpp | 8 ++++---- src/solver/tensorOp/OpTensorFwdBias.cpp | 10 +++++----- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 12 ++++++------ 11 files changed, 57 insertions(+), 57 deletions(-) diff --git a/src/include/miopen/tensorOp/problem_description.hpp b/src/include/miopen/tensorOp/problem_description.hpp index 515955f5be..8aa4529ee3 100644 --- a/src/include/miopen/tensorOp/problem_description.hpp +++ b/src/include/miopen/tensorOp/problem_description.hpp @@ -103,7 +103,7 @@ struct ProblemDescription : ProblemDescriptionBase } } - const miopenTensorOp_t GetTensorOp() const { return tensorOp; } + miopenTensorOp_t GetTensorOp() const { return tensorOp; } float GetBeta() const { return beta; } @@ -111,7 +111,7 @@ struct ProblemDescription : ProblemDescriptionBase const TensorDescriptor& GetBTensorDesc() const { return bTensorDesc; } const TensorDescriptor& GetCTensorDesc() const { return cTensorDesc; } - const bool GetNonStandardSquash() const { return nonStandardSquash; } + bool GetNonStandardSquash() const { return nonStandardSquash; } NetworkConfig MakeNetworkConfig() const override; diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index 581b22f68e..b66274ad58 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -38,7 +38,7 @@ namespace solver { namespace tensorOp { -bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, +bool Op1dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -58,16 +58,16 @@ bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op1dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op1dTensorGeneric::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution -Op1dTensorGeneric::GetSolution(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const + Op1dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = 
ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index 23b7210094..f910f63507 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, +bool Op2dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -57,16 +57,16 @@ bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op2dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op2dTensorGeneric::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution -Op2dTensorGeneric::GetSolution(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const + Op2dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index d76c4f57f6..73f3659081 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -38,7 +38,7 @@ namespace solver { namespace tensorOp { -bool Op2dTensorLite::IsApplicable(const ExecutionContext& context, +bool Op2dTensorLite::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -81,14 +81,14 @@ bool Op2dTensorLite::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op2dTensorLite::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op2dTensorLite::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, +ConvSolution Op2dTensorLite::GetSolution([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index 93f2868905..2f793a3fb7 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op2dTensorSquash::IsApplicable(const ExecutionContext& context, +bool Op2dTensorSquash::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -73,16 +73,16 @@ bool Op2dTensorSquash::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op2dTensorSquash::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op2dTensorSquash::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] 
const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution -Op2dTensorSquash::GetSolution(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const + Op2dTensorSquash::GetSolution([[maybe unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index 1aeb83509d..d10d096536 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op3dTensorGeneric::IsApplicable(const ExecutionContext& context, +bool Op3dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -57,16 +57,16 @@ bool Op3dTensorGeneric::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op3dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op3dTensorGeneric::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution -Op3dTensorGeneric::GetSolution(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const + Op3dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index 0733981896..ed070dc684 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op4dTensorGeneric::IsApplicable(const ExecutionContext& context, +bool Op4dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -57,16 +57,16 @@ bool Op4dTensorGeneric::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op4dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op4dTensorGeneric::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution -Op4dTensorGeneric::GetSolution(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const + Op4dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp index 6751cfc9fc..41d6cdbe26 100644 --- a/src/solver/tensorOp/Op4dTensorLite.cpp +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op4dTensorLite::IsApplicable(const ExecutionContext& context, +bool Op4dTensorLite::IsApplicable([[maybe unused]] const ExecutionContext& context, const 
miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -82,14 +82,14 @@ bool Op4dTensorLite::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op4dTensorLite::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op4dTensorLite::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution Op4dTensorLite::GetSolution(const ExecutionContext& context, +ConvSolution Op4dTensorLite::GetSolution([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp index b0cb0397e0..e87246c384 100644 --- a/src/solver/tensorOp/Op5dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp @@ -38,7 +38,7 @@ namespace solver { namespace tensorOp { -bool Op5dTensorGeneric::IsApplicable(const ExecutionContext& context, +bool Op5dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -58,9 +58,9 @@ bool Op5dTensorGeneric::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op5dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op5dTensorGeneric::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index 8586b50034..a1433c1f7f 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool OpTensorFwdBias::IsApplicable(const ExecutionContext& context, +bool OpTensorFwdBias::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -73,14 +73,14 @@ bool OpTensorFwdBias::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -OpTensorFwdBias::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t OpTensorFwdBias::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, +ConvSolution OpTensorFwdBias::GetSolution([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index d8d09461ac..4526c05329 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool OpTensorLeadingOnes::IsApplicable(const ExecutionContext& context, +bool OpTensorLeadingOnes::IsApplicable([[maybe unused]] const ExecutionContext& context, const 
miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -90,16 +90,16 @@ bool OpTensorLeadingOnes::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -OpTensorLeadingOnes::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t OpTensorLeadingOnes::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution -OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const + OpTensorLeadingOnes::GetSolution([[maybe unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; From 5a9b5edd1420faaaafdfdc2e4e43f9413e1d9459 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Fri, 8 Nov 2024 10:08:23 +0200 Subject: [PATCH 13/25] fix typos --- src/solver/tensorOp/Op1dTensorGeneric.cpp | 10 +++++----- src/solver/tensorOp/Op2dTensorGeneric.cpp | 10 +++++----- src/solver/tensorOp/Op2dTensorLite.cpp | 8 ++++---- src/solver/tensorOp/Op2dTensorSquash.cpp | 10 +++++----- src/solver/tensorOp/Op3dTensorGeneric.cpp | 10 +++++----- src/solver/tensorOp/Op4dTensorGeneric.cpp | 10 +++++----- src/solver/tensorOp/Op4dTensorLite.cpp | 8 ++++---- src/solver/tensorOp/Op5dTensorGeneric.cpp | 6 +++--- src/solver/tensorOp/OpTensorFwdBias.cpp | 8 ++++---- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 10 +++++----- 10 files changed, 45 insertions(+), 45 deletions(-) diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index b66274ad58..2b5338134b 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -38,7 +38,7 @@ namespace solver { namespace tensorOp { -bool Op1dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op1dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -59,15 +59,15 @@ bool Op1dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& co } std::size_t Op1dTensorGeneric::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution - Op1dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index f910f63507..ac5658f144 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op2dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op2dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { 
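[For reference, the typo this patch fixes is inside the attribute itself: C++17's maybe_unused is a single token spelled with an underscore. A minimal illustration.]

// OK: suppresses unused-parameter warnings on builds with -Wunused.
void IsApplicableLike([[maybe_unused]] int context) {}

// "[[maybe unused]]" (space instead of underscore) is not a valid
// attribute-token sequence and is rejected by compilers, hence this fix-up.
int main()
{
    IsApplicableLike(0);
    return 0;
}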
const auto& aTensorDesc = problem.GetATensorDesc(); @@ -58,15 +58,15 @@ bool Op2dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& co } std::size_t Op2dTensorGeneric::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution - Op2dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index 73f3659081..3eb8688cd1 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -38,7 +38,7 @@ namespace solver { namespace tensorOp { -bool Op2dTensorLite::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op2dTensorLite::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -82,13 +82,13 @@ bool Op2dTensorLite::IsApplicable([[maybe unused]] const ExecutionContext& conte } std::size_t Op2dTensorLite::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution Op2dTensorLite::GetSolution([[maybe unused]] const ExecutionContext& context, +ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index 2f793a3fb7..b5bee28bd5 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op2dTensorSquash::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op2dTensorSquash::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -74,15 +74,15 @@ bool Op2dTensorSquash::IsApplicable([[maybe unused]] const ExecutionContext& con } std::size_t Op2dTensorSquash::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution - Op2dTensorSquash::GetSolution([[maybe unused]] const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp 
index d10d096536..049fb6860f 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op3dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op3dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -58,15 +58,15 @@ bool Op3dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& co } std::size_t Op3dTensorGeneric::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution - Op3dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index ed070dc684..92f179f772 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op4dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op4dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -58,15 +58,15 @@ bool Op4dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& co } std::size_t Op4dTensorGeneric::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution - Op4dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +Op4dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp index 41d6cdbe26..96ca761063 100644 --- a/src/solver/tensorOp/Op4dTensorLite.cpp +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op4dTensorLite::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op4dTensorLite::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -83,13 +83,13 @@ bool Op4dTensorLite::IsApplicable([[maybe unused]] const ExecutionContext& conte } std::size_t Op4dTensorLite::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const 
miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution Op4dTensorLite::GetSolution([[maybe unused]] const ExecutionContext& context, +ConvSolution Op4dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp index e87246c384..63a7f5ddbc 100644 --- a/src/solver/tensorOp/Op5dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp @@ -38,7 +38,7 @@ namespace solver { namespace tensorOp { -bool Op5dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op5dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -59,8 +59,8 @@ bool Op5dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& co } std::size_t Op5dTensorGeneric::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index a1433c1f7f..09a595582b 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool OpTensorFwdBias::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool OpTensorFwdBias::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -74,13 +74,13 @@ bool OpTensorFwdBias::IsApplicable([[maybe unused]] const ExecutionContext& cont } std::size_t OpTensorFwdBias::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution OpTensorFwdBias::GetSolution([[maybe unused]] const ExecutionContext& context, +ConvSolution OpTensorFwdBias::GetSolution([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index 4526c05329..11d33005b7 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool OpTensorLeadingOnes::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool OpTensorLeadingOnes::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -91,15 +91,15 @@ bool OpTensorLeadingOnes::IsApplicable([[maybe unused]] const ExecutionContext& } std::size_t OpTensorLeadingOnes::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const 
ExecutionContext& context,
-    [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const
+    [[maybe_unused]] const ExecutionContext& context,
+    [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const
 {
     return 0;
 }
 
 ConvSolution
-    OpTensorLeadingOnes::GetSolution([[maybe unused]] const ExecutionContext& context,
-                                     const miopen::tensorOp::ProblemDescription& problem) const
+OpTensorLeadingOnes::GetSolution([[maybe_unused]] const ExecutionContext& context,
+                                 const miopen::tensorOp::ProblemDescription& problem) const
 {
     auto result = ConvSolution{miopenStatusSuccess};

From c9f310aa852c74f2e59335956b36e5049ac10481 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Fri, 15 Nov 2024 19:37:03 +0200
Subject: [PATCH 14/25] implementing suggestions, updating network_config and changes to potentially boost performance on host side

---
 src/include/miopen/names.hpp                  |  1 +
 src/include/miopen/tensorOp/invoke_params.hpp | 25 +---
 .../miopen/tensorOp/problem_description.hpp   | 24 ++--
 src/solver/tensorOp/Op1dTensorGeneric.cpp     | 53 ++++----
 src/solver/tensorOp/Op2dTensorGeneric.cpp     | 92 +++++++++-----
 src/solver/tensorOp/Op2dTensorLite.cpp        | 33 +++--
 src/solver/tensorOp/Op2dTensorSquash.cpp      | 18 +--
 src/solver/tensorOp/Op3dTensorGeneric.cpp     | 29 +++--
 src/solver/tensorOp/Op4dTensorGeneric.cpp     | 40 ++++--
 src/solver/tensorOp/Op4dTensorLite.cpp        |  6 +-
 src/solver/tensorOp/Op5dTensorGeneric.cpp     | 114 ++++++++++--------
 src/solver/tensorOp/OpTensorFwdBias.cpp       | 37 ++++--
 src/solver/tensorOp/OpTensorLeadingOnes.cpp   | 33 +++--
 src/tensor.cpp                                | 16 +--
 src/tensorOp/problem_description.cpp          | 37 +++---
 15 files changed, 328 insertions(+), 230 deletions(-)

diff --git a/src/include/miopen/names.hpp b/src/include/miopen/names.hpp
index 17b96b8732..bdf59c361c 100644
--- a/src/include/miopen/names.hpp
+++ b/src/include/miopen/names.hpp
@@ -34,6 +34,7 @@ struct NetworkConfig
 {
     NetworkConfig() = default;
     explicit NetworkConfig(const std::string& value_) : value(value_) {}
+    explicit NetworkConfig(std::string&& value_) noexcept : value(std::move(value_)) {}
     operator std::string() const { return value; }
     const std::string& ToString() const { return value; }
 
diff --git a/src/include/miopen/tensorOp/invoke_params.hpp b/src/include/miopen/tensorOp/invoke_params.hpp
index 99ff13da47..6b8f2ca88c 100644
--- a/src/include/miopen/tensorOp/invoke_params.hpp
+++ b/src/include/miopen/tensorOp/invoke_params.hpp
@@ -35,34 +35,24 @@ namespace tensorOp {
 
 struct InvokeParams : public miopen::InvokeParams
 {
-    InvokeParams(miopenTensorOp_t tensorOp_,
-                 const void* alpha0_,
-                 const TensorDescriptor& aTensorDesc_,
+    InvokeParams(const void* alpha0_,
                  ConstData_t ATensor_,
                  const void* alpha1_,
-                 const TensorDescriptor& bTensorDesc_,
                  ConstData_t BTensor_,
                  const void* beta_,
-                 const TensorDescriptor& cTensorDesc_,
                  Data_t CTensor_,
                  const size_t Aoffset_,
                  const size_t Boffset_,
-                 const size_t Coffset_,
-                 const bool nonStandardSquash_)
+                 const size_t Coffset_)
         : alpha0(alpha0_),
          alpha1(alpha1_),
          beta(beta_),
-         tensorOperation(tensorOp_),
-         aTensorDesc(aTensorDesc_),
          ATensor(ATensor_),
-         bTensorDesc(bTensorDesc_),
          BTensor(BTensor_),
-         cTensorDesc(cTensorDesc_),
          CTensor(CTensor_),
          Aoffset(Aoffset_),
          Boffset(Boffset_),
-         Coffset(Coffset_),
-         nonStandardSquash(nonStandardSquash_)
+         Coffset(Coffset_)
     {
     }
 
@@ -74,22 +64,13 @@ struct InvokeParams : public miopen::InvokeParams
     const void* alpha1;
     const void* beta;
 
-    miopenTensorOp_t tensorOperation;
-
-    TensorDescriptor aTensorDesc;
     ConstData_t ATensor;
-
-    TensorDescriptor bTensorDesc;
     ConstData_t BTensor;
-
-    TensorDescriptor cTensorDesc;
     Data_t CTensor;
 
     size_t Aoffset;
     size_t Boffset;
     size_t Coffset;
-
-    
bool nonStandardSquash; }; } // namespace tensorOp diff --git a/src/include/miopen/tensorOp/problem_description.hpp b/src/include/miopen/tensorOp/problem_description.hpp index 8aa4529ee3..dc60a3c7c9 100644 --- a/src/include/miopen/tensorOp/problem_description.hpp +++ b/src/include/miopen/tensorOp/problem_description.hpp @@ -66,8 +66,8 @@ struct ProblemDescription : ProblemDescriptionBase MIOPEN_THROW("Datatypes for B and C tensors do not match !"); } - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); if(clens.size() > 5) { @@ -82,14 +82,12 @@ struct ProblemDescription : ProblemDescriptionBase if(!nonStandardSquash) { - for(std::size_t i = 0; i < clens.size(); i++) - { - if(blens[i] != 1 && blens[i] != clens[i]) - { - MIOPEN_THROW("BTensor dim != 1 && BTensor dim != CTensor dim: " + - std::to_string(i)); - } - } + constexpr auto comparator = [](size_t c, size_t b) { return b == 1 || b == c; }; + const auto [c_diff, b_diff] = + std::mismatch(clens.begin(), clens.end(), blens.begin(), comparator); + if(c_diff != clens.end()) + MIOPEN_THROW("BTensor dim != 1 && BTensor dim != CTensor dim:" + + std::to_string(std::distance(clens.begin(), c_diff))); } else { @@ -120,9 +118,9 @@ struct ProblemDescription : ProblemDescriptionBase float beta; - const TensorDescriptor& aTensorDesc; - const TensorDescriptor& bTensorDesc; - const TensorDescriptor& cTensorDesc; + TensorDescriptor aTensorDesc; + TensorDescriptor bTensorDesc; + TensorDescriptor cTensorDesc; const bool nonStandardSquash; }; diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index 2b5338134b..7a2662e60b 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -71,14 +71,24 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); - const auto& clens = cTensorDesc.GetLengths(); + const size_t b_n = bTensorDesc.GetLengths()[0]; + const size_t c_n = cTensorDesc.GetLengths()[0]; + + const size_t a_nstrides = aTensorDesc.GetStrides()[0]; + const size_t b_nstrides = bTensorDesc.GetStrides()[0]; + const size_t c_nstrides = cTensorDesc.GetStrides()[0]; + + miopenDataType_t data_type = bTensorDesc.GetType(); + bool fit_into_int = aTensorDesc.AllDimsFitIntoInt(); size_t local_threads = 256; size_t max_num_wg = 4096; - auto num_wg = std::clamp(clens[0] / local_threads, size_t(1), size_t(max_num_wg)); + auto num_wg = std::clamp(c_n / local_threads, size_t(1), size_t(max_num_wg)); num_wg = num_wg > max_num_wg ? 
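[A standalone form of the std::mismatch-based broadcast check introduced in problem_description.hpp above: every B length must be 1 or equal to the corresponding C length. The function name is illustrative.]

#include <algorithm>
#include <cstddef>
#include <vector>

bool BroadcastCompatible(const std::vector<std::size_t>& clens,
                         const std::vector<std::size_t>& blens)
{
    auto ok = [](std::size_t c, std::size_t b) { return b == 1 || b == c; };
    // first mismatching position, or clens.end() if fully compatible
    return std::mismatch(clens.begin(), clens.end(), blens.begin(), ok).first == clens.end();
}

// BroadcastCompatible({4, 8, 16}, {1, 8, 1})  -> true
// BroadcastCompatible({4, 8, 16}, {2, 8, 16}) -> false (dim 0 offends)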
max_num_wg : num_wg; size_t global_threads = num_wg * local_threads; @@ -102,38 +112,33 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [](const std::vector kernels) { + result + .invoker_factory = [data_type, fit_into_int, b_n, c_n, a_nstrides, b_nstrides, c_nstrides]( + const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& blens = params.bTensorDesc.GetLengths(); - const auto& clens = params.cTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - - if(params.aTensorDesc.AllDimsFitIntoInt()) - { // change offsets to 64bit after PR is merged + if(fit_into_int) + { kernel(params.ATensor, params.BTensor, params.CTensor, static_cast(params.Aoffset), static_cast(params.Boffset), static_cast(params.Coffset), - static_cast(astrides[0]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(cstrides[0]), + static_cast(a_nstrides), + static_cast(b_n == 1 ? 0 : b_nstrides), + static_cast(c_nstrides), miopen_alpha0, miopen_alpha1, miopen_beta, - static_cast(clens[0]), + static_cast(c_n), !float_equal(miopen_beta, 0.0)); } else @@ -141,16 +146,16 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel(params.ATensor, params.BTensor, params.CTensor, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(astrides[0]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(cstrides[0]), + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(a_nstrides), + static_cast(b_n == 1 ? 
0 : b_nstrides), + static_cast(c_nstrides), miopen_alpha0, miopen_alpha1, miopen_beta, - static_cast(clens[0]), + static_cast(c_n), !float_equal(miopen_beta, 0.0)); } }); diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index ac5658f144..917eee17c2 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -70,9 +70,24 @@ Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); - const auto& clens = cTensorDesc.GetLengths(); + std::array blens; + std::array clens; + std::tie(blens[0], blens[1]) = miopen::tien<2>(bTensorDesc.GetLengths()); + std::tie(clens[0], clens[1]) = miopen::tien<2>(cTensorDesc.GetLengths()); + + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1]) = miopen::tien<2>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1]) = miopen::tien<2>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1]) = miopen::tien<2>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + bool fit_into_int = aTensorDesc.AllDimsFitIntoInt(); size_t local_threads = 32; size_t max_num_wg = 4096; @@ -101,42 +116,61 @@ Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [](const std::vector kernels) { + result.invoker_factory = [data_type, fit_into_int, blens, clens, astrides, bstrides, cstrides]( + const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& blens = params.bTensorDesc.GetLengths(); - const auto& clens = params.cTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - - kernel(params.ATensor, - params.BTensor, - params.CTensor, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(blens[1] == 1 ? clens[1] : blens[1]), - static_cast(clens[1]), - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(blens[1] == 1 ? 0 : bstrides[1]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); + if(fit_into_int) + { + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(blens[1] == 1 ? clens[1] : blens[1]), + static_cast(clens[1]), + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(blens[0] == 1 ? 0 : bstrides[0]), + static_cast(blens[1] == 1 ? 
0 : bstrides[1]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(clens[0]), + !float_equal(miopen_beta, 0.0)); + } + else + { + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(blens[1] == 1 ? clens[1] : blens[1]), + static_cast(clens[1]), + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(blens[0] == 1 ? 0 : bstrides[0]), + static_cast(blens[1] == 1 ? 0 : bstrides[1]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(clens[0]), + !float_equal(miopen_beta, 0.0)); + } }); }; }; diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index 3eb8688cd1..696d73b073 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -93,12 +93,19 @@ ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext { auto result = ConvSolution{miopenStatusSuccess}; + const auto& aTensorDesc = problem.GetATensorDesc(); const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); + const size_t a_cstride = aTensorDesc.GetStrides()[1]; + const size_t b_cstride = bTensorDesc.GetStrides()[1]; + const size_t c_cstride = cTensorDesc.GetStrides()[1]; + + miopenDataType_t data_type = bTensorDesc.GetType(); + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int max_num_wg = 4096; @@ -107,7 +114,7 @@ ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext size_t local_threads = 256; // for naive tensor ops - auto&& [RD_BLCK, READ_TYPE] = GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType()); + auto&& [RD_BLCK, READ_TYPE] = GetRDBLCKandREADTYPE(clens[2], data_type); size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); size_t grp_sz = (total_work + local_threads - 1) / local_threads; @@ -143,28 +150,28 @@ ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [total_work, total_work2](const std::vector kernels) { + result.invoker_factory = [data_type, + b_c = blens[1], + a_cstride, + b_cstride, + c_cstride, + total_work, + total_work2](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& blens = params.bTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - kernel(params.ATensor, - static_cast(astrides[1]), + static_cast(a_cstride), params.BTensor, - static_cast(bstrides[1]), + static_cast(b_cstride), params.CTensor, - static_cast(cstrides[1]), + static_cast(c_cstride), miopen_alpha0, miopen_alpha1, 
miopen_beta, @@ -174,7 +181,7 @@ ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext static_cast(total_work), static_cast(total_work2), static_cast(!float_equal(miopen_beta, 0.0)), - static_cast(blens[1] == 1)); + static_cast(b_c == 1)); }); }; }; diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index b5bee28bd5..0f400eab2d 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -92,6 +92,10 @@ Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); + const size_t b_nstride = bTensorDesc.GetStrides()[1]; + + miopenDataType_t data_type = bTensorDesc.GetType(); + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int max_num_wg = 4096; @@ -100,7 +104,7 @@ Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, size_t local_threads = 256; // for naive tensor ops - auto&& [RD_BLCK, READ_TYPE] = GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType()); + auto&& [RD_BLCK, READ_TYPE] = GetRDBLCKandREADTYPE(clens[2], data_type); size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); size_t grp_sz = (total_work + local_threads - 1) / local_threads; @@ -130,23 +134,21 @@ Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [total_work](const std::vector kernels) { + result.invoker_factory = [data_type, b_c = blens[1], b_nstride, total_work]( + const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& blens = params.bTensorDesc.GetLengths(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - kernel(params.ATensor, params.BTensor, - static_cast(blens[1]), - static_cast(bstrides[1]), + static_cast(b_c), + static_cast(b_nstride), params.CTensor, miopen_alpha0, miopen_alpha1, diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index 049fb6860f..f4cdb191cf 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -70,12 +70,22 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; + const auto& aTensorDesc = problem.GetATensorDesc(); const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1], astrides[2]) = miopen::tien<3>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1], bstrides[2]) = miopen::tien<3>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1], cstrides[2]) = miopen::tien<3>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + 
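// [Editorial sketch] GetBitmapAndWgInfo, called just below, condenses the B/C
// lengths into a bitmap of the dimensions along which B is not broadcast, plus
// workgroup sizing. The helper is internal to MIOpen; the following is only a
// hedged approximation of the bitmap part, assuming bit i is counted from the
// innermost dimension:
//
//     #include <cstddef>
//     #include <vector>
//
//     inline unsigned BroadcastBitmap(const std::vector<std::size_t>& blens,
//                                     const std::vector<std::size_t>& clens)
//     {
//         unsigned bitmap = 0;
//         for(std::size_t i = 0; i < clens.size(); ++i)
//             if(blens[i] == clens[i]) // B varies along this dim (not broadcast)
//                 bitmap |= 1u << (clens.size() - 1 - i);
//         return bitmap;
//     }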
auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int num_wg_orig = num_wg; @@ -106,23 +116,24 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [bitmap, work_per_wg, num_wg_orig](const std::vector kernels) { + result.invoker_factory = [data_type, + blens, + clens, + astrides, + bstrides, + cstrides, + bitmap, + work_per_wg, + num_wg_orig](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& blens = params.bTensorDesc.GetLengths(); - const auto& clens = params.cTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - kernel(params.ATensor, static_cast(astrides[0]), static_cast(astrides[1]), diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index 92f179f772..93b9b9847b 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -70,6 +70,27 @@ Op4dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + std::array blens; + std::array clens; + std::tie(blens[0], blens[1], blens[2], blens[3]) = miopen::tien<4>(bTensorDesc.GetLengths()); + std::tie(clens[0], clens[1], clens[2], clens[3]) = miopen::tien<4>(cTensorDesc.GetLengths()); + + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1], astrides[2], astrides[3]) = + miopen::tien<4>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1], bstrides[2], bstrides[3]) = + miopen::tien<4>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1], cstrides[2], cstrides[3]) = + miopen::tien<4>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + int max_num_wg = 4096; auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = @@ -95,23 +116,24 @@ Op4dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [work_per_wg, num_wg_orig, bitmap](const std::vector kernels) { + result.invoker_factory = [data_type, + blens, + clens, + astrides, + bstrides, + cstrides, + work_per_wg, + num_wg_orig, + bitmap](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = 
as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& blens = params.bTensorDesc.GetLengths(); - const auto& clens = params.cTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - kernel(params.ATensor, static_cast(astrides[0]), // a_nstride, static_cast(astrides[1]), // a_cstride, diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp index 96ca761063..a53174507e 100644 --- a/src/solver/tensorOp/Op4dTensorLite.cpp +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -97,6 +97,8 @@ ConvSolution Op4dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); + miopenDataType_t data_type = bTensorDesc.GetType(); + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = Get4dParams(problem, true); @@ -127,12 +129,12 @@ ConvSolution Op4dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [total_work](const std::vector kernels) { + result.invoker_factory = [data_type, total_work](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp index 63a7f5ddbc..35ef705f5b 100644 --- a/src/solver/tensorOp/Op5dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp @@ -66,17 +66,30 @@ std::size_t Op5dTensorGeneric::GetWorkspaceSize( } ConvSolution -Op5dTensorGeneric::GetSolution(const ExecutionContext& context, +Op5dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; - const auto& cTensorDesc = problem.GetCTensorDesc(); + const auto& aTensorDesc = problem.GetATensorDesc(); const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1], astrides[2], astrides[3], astrides[4]) = + miopen::tien<5>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1], bstrides[2], bstrides[3], bstrides[4]) = + miopen::tien<5>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1], cstrides[2], cstrides[3], cstrides[4]) = + miopen::tien<5>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int num_wg_orig = num_wg; @@ -107,58 +120,53 @@ Op5dTensorGeneric::GetSolution(const ExecutionContext& context, 
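// [Editorial sketch] As in the 1-D/2-D/3-D solvers earlier in this patch, a
// length-1 B dimension is broadcast by passing the kernel a zero stride, so
// indexing re-reads the same element along that axis. Stand-alone form of the
// normalization behind those "blens[i] == 1 ? 0 : bstrides[i]" arguments:
//
//     #include <array>
//     #include <cstddef>
//
//     template <std::size_t N>
//     std::array<std::size_t, N>
//     BroadcastStrides(const std::array<std::size_t, N>& lens,
//                      const std::array<std::size_t, N>& strides)
//     {
//         std::array<std::size_t, N> out{};
//         for(std::size_t i = 0; i < N; ++i)
//             out[i] = (lens[i] == 1) ? 0 : strides[i]; // stride 0 repeats the element
//         return out;
//     }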
kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [bitmap, work_per_wg, num_wg_orig](const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - const auto& blens = params.bTensorDesc.GetLengths(); - const auto& clens = params.cTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - - kernel(params.ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - static_cast(astrides[3]), - params.BTensor, - static_cast(blens[1]), // b_c, - static_cast(blens[2]), // b_d, - static_cast(blens[3]), // b_h, - static_cast(blens[4]), // b_w, - static_cast(bstrides[0]), // b_nstride, - static_cast(bstrides[1]), // b_cstride, - static_cast(bstrides[2]), // b_dstride, - static_cast(bstrides[3]), // b_hstride, - params.CTensor, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_d, - static_cast(clens[3]), // c_h, - static_cast(clens[4]), // c_w, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_dstride, - static_cast(cstrides[3]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - bitmap, - work_per_wg, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig)); - }); + result.invoker_factory = + [data_type, blens, clens, astrides, bstrides, cstrides, bitmap, work_per_wg, num_wg_orig]( + const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(astrides[2]), + static_cast(astrides[3]), + params.BTensor, + static_cast(blens[1]), // b_c, + static_cast(blens[2]), // b_d, + static_cast(blens[3]), // b_h, + static_cast(blens[4]), // b_w, + static_cast(bstrides[0]), // b_nstride, + static_cast(bstrides[1]), // b_cstride, + static_cast(bstrides[2]), // b_dstride, + static_cast(bstrides[3]), // b_hstride, + params.CTensor, + static_cast(clens[1]), // c_c, + static_cast(clens[2]), // c_d, + static_cast(clens[3]), // c_h, + static_cast(clens[4]), // c_w, + static_cast(cstrides[0]), // c_nstride, + static_cast(cstrides[1]), // c_cstride, + static_cast(cstrides[2]), // c_dstride, + static_cast(cstrides[3]), // c_hstride, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + bitmap, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig)); + }); + }; }; - }; result.construction_params.push_back(kernel); return result; diff --git 
a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index 09a595582b..16e5d19d70 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -89,6 +89,23 @@ ConvSolution OpTensorFwdBias::GetSolution([[maybe_unused]] const ExecutionContex const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); + std::array blens; + std::array clens; + std::tie(blens[0], blens[1], blens[2], blens[3]) = miopen::tien<4>(bTensorDesc.GetLengths()); + std::tie(clens[0], clens[1], clens[2], clens[3]) = miopen::tien<4>(cTensorDesc.GetLengths()); + + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1], astrides[2], astrides[3]) = + miopen::tien<4>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1], bstrides[2], bstrides[3]) = + miopen::tien<4>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1], cstrides[2], cstrides[3]) = + miopen::tien<4>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + int max_num_wg = 4096; auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = @@ -130,23 +147,25 @@ ConvSolution OpTensorFwdBias::GetSolution([[maybe_unused]] const ExecutionContex kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); result.invoker_factory = - [work_per_wg, num_wg_orig, incr_wg, packed_tensor](const std::vector kernels) { + [data_type, + blens, + clens, + astrides, + bstrides, + cstrides, + work_per_wg, + num_wg_orig, + incr_wg, + packed_tensor](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& blens = params.bTensorDesc.GetLengths(); - const auto& clens = params.cTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - if(packed_tensor) { // OpTensorFwdBias kernel(params.ATensor, diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index 11d33005b7..9792b21093 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -107,6 +107,21 @@ OpTensorLeadingOnes::GetSolution([[maybe_unused]] const ExecutionContext& contex const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); + std::array clens; + std::tie(clens[0], clens[1], clens[2], clens[3]) = miopen::tien<4>(cTensorDesc.GetLengths()); + + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1], astrides[2], astrides[3]) = + miopen::tien<4>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1], bstrides[2], bstrides[3]) = + miopen::tien<4>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1], cstrides[2], cstrides[3]) = + miopen::tien<4>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + int max_num_wg = 4096; auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, 
local_threads, global_threads] = @@ -147,22 +162,24 @@ OpTensorLeadingOnes::GetSolution([[maybe_unused]] const ExecutionContext& contex kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); result.invoker_factory = - [work_per_wg, num_wg_orig, bitmap, packed_tensor](const std::vector kernels) { + [data_type, + clens, + astrides, + bstrides, + cstrides, + work_per_wg, + num_wg_orig, + bitmap, + packed_tensor](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& clens = params.cTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - if(packed_tensor) { // OpTensorLeadingOnes kernel(params.ATensor, diff --git a/src/tensor.cpp b/src/tensor.cpp index f65a1a408e..c1bd709267 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -906,20 +906,8 @@ void OpTensor2(Handle& handle, const auto problem = tensorOp::ProblemDescription{ tensorOp, beta, aTensorDesc, bTensorDesc, cTensorDesc, nonStandardSquash}; - const auto invoke_params = tensorOp::InvokeParams{tensorOp, - alpha0, - aTensorDesc, - ATensor, - alpha1, - bTensorDesc, - BTensor, - beta, - cTensorDesc, - CTensor, - Aoffset, - Boffset, - Coffset, - nonStandardSquash}; + const auto invoke_params = tensorOp::InvokeParams{ + alpha0, ATensor, alpha1, BTensor, beta, CTensor, Aoffset, Boffset, Coffset}; const auto algo = AlgorithmName{"TensorOpSolver"}; const auto solvers = solver::SolverContainer{} + diff --git a/src/tensorOp/problem_description.cpp b/src/tensorOp/problem_description.cpp index 4056fd3172..6053e7f1a0 100644 --- a/src/tensorOp/problem_description.cpp +++ b/src/tensorOp/problem_description.cpp @@ -34,28 +34,31 @@ namespace tensorOp { NetworkConfig ProblemDescription::MakeNetworkConfig() const { - std::ostringstream ss; + std::string ss; - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); - auto astrides = aTensorDesc.GetStrides(); - auto bstrides = bTensorDesc.GetStrides(); - auto cstrides = cTensorDesc.GetStrides(); + const auto& astrides = aTensorDesc.GetStrides(); + const auto& bstrides = bTensorDesc.GetStrides(); + const auto& cstrides = cTensorDesc.GetStrides(); - auto printDims = [&ss](const auto& dim) { - for(uint32_t i = 0; i < dim.size(); i++) + auto printDims = [&ss, dims = alens.size() - 1](const auto& dim) { + for(uint32_t i = 0; i < dims; i++) { - ss << dim[i]; - if(i != (dim.size() - 1)) - { - ss << "x"; - } + ss.append(std::to_string(dim[i])); + ss += 'x'; } - ss << "-"; + ss += std::to_string(dim.back()); + ss += '-'; }; - ss << std::to_string(aTensorDesc.GetType()) << "-" << std::to_string(tensorOp) << "-"; + ss.reserve(1024); + ss.append(std::string_view("TensorOp-")); + ss += std::to_string(aTensorDesc.GetType()); + ss += '-'; + ss += std::to_string(tensorOp); + ss += '-'; printDims(alens); printDims(blens); @@ -63,9 +66,9 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const 
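// [Editorial note] The std::string-based key builder in this hunk drops the
// previous std::ostringstream and its locale/formatting layer; with the
// reserve(1024) a typical key costs a single allocation. In outline:
//
//     std::string s;
//     s.reserve(1024);               // one allocation for typical keys
//     s += "TensorOp-";
//     s += std::to_string(type);     // plain appends, no stream state
//     s += '-';
//     // ... printDims-style appends for lengths and strides ...
//     return NetworkConfig(std::move(s)); // new rvalue ctor: buffer handed over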
printDims(bstrides); printDims(cstrides); - ss << (float_equal(beta, 0.0f) ? "1" : "0"); + ss += (float_equal(beta, 0.0f) ? '1' : '0'); - return NetworkConfig{ss.str()}; + return NetworkConfig(std::move(ss)); } } // namespace tensorOp From 496b414712ff764b8abfc73896b7b1e1b3835848 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Fri, 15 Nov 2024 19:38:35 +0200 Subject: [PATCH 15/25] clang format --- src/solver/tensorOp/Op1dTensorGeneric.cpp | 96 +++++++------- src/solver/tensorOp/Op2dTensorLite.cpp | 64 +++++----- src/solver/tensorOp/Op2dTensorSquash.cpp | 56 ++++---- src/solver/tensorOp/Op3dTensorGeneric.cpp | 78 ++++++----- src/solver/tensorOp/Op4dTensorGeneric.cpp | 88 ++++++------- src/solver/tensorOp/OpTensorFwdBias.cpp | 131 ++++++++++--------- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 135 ++++++++++---------- 7 files changed, 315 insertions(+), 333 deletions(-) diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index 7a2662e60b..c15b02ced7 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -112,55 +112,55 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result - .invoker_factory = [data_type, fit_into_int, b_n, c_n, a_nstrides, b_nstrides, c_nstrides]( - const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(data_type, [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - if(fit_into_int) - { - kernel(params.ATensor, - params.BTensor, - params.CTensor, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(a_nstrides), - static_cast(b_n == 1 ? 0 : b_nstrides), - static_cast(c_nstrides), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(c_n), - !float_equal(miopen_beta, 0.0)); - } - else - { - kernel(params.ATensor, - params.BTensor, - params.CTensor, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(a_nstrides), - static_cast(b_n == 1 ? 0 : b_nstrides), - static_cast(c_nstrides), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(c_n), - !float_equal(miopen_beta, 0.0)); - } - }); + result.invoker_factory = + [data_type, fit_into_int, b_n, c_n, a_nstrides, b_nstrides, c_nstrides]( + const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + if(fit_into_int) + { + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(a_nstrides), + static_cast(b_n == 1 ? 
0 : b_nstrides), + static_cast(c_nstrides), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(c_n), + !float_equal(miopen_beta, 0.0)); + } + else + { + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(a_nstrides), + static_cast(b_n == 1 ? 0 : b_nstrides), + static_cast(c_nstrides), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(c_n), + !float_equal(miopen_beta, 0.0)); + } + }); + }; }; - }; result.construction_params.push_back(kernel); return result; diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index 696d73b073..2b7b030a2f 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -150,41 +150,37 @@ ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [data_type, - b_c = blens[1], - a_cstride, - b_cstride, - c_cstride, - total_work, - total_work2](const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(data_type, [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - kernel(params.ATensor, - static_cast(a_cstride), - params.BTensor, - static_cast(b_cstride), - params.CTensor, - static_cast(c_cstride), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(total_work), - static_cast(total_work2), - static_cast(!float_equal(miopen_beta, 0.0)), - static_cast(b_c == 1)); - }); + result.invoker_factory = + [data_type, b_c = blens[1], a_cstride, b_cstride, c_cstride, total_work, total_work2]( + const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + static_cast(a_cstride), + params.BTensor, + static_cast(b_cstride), + params.CTensor, + static_cast(c_cstride), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(total_work), + static_cast(total_work2), + static_cast(!float_equal(miopen_beta, 0.0)), + static_cast(b_c == 1)); + }); + }; }; - }; result.construction_params.push_back(kernel); return result; diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index 0f400eab2d..d6ca7cfa3b 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -134,35 +134,35 @@ Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = 
[data_type, b_c = blens[1], b_nstride, total_work]( - const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(data_type, [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - kernel(params.ATensor, - params.BTensor, - static_cast(b_c), - static_cast(b_nstride), - params.CTensor, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(total_work), - static_cast(!float_equal(miopen_alpha0, 0.0)), - static_cast(!float_equal(miopen_alpha1, 0.0)), - static_cast(!float_equal(miopen_beta, 0.0))); - }); + result.invoker_factory = + [data_type, b_c = blens[1], b_nstride, total_work](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + params.BTensor, + static_cast(b_c), + static_cast(b_nstride), + params.CTensor, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(total_work), + static_cast(!float_equal(miopen_alpha0, 0.0)), + static_cast(!float_equal(miopen_alpha1, 0.0)), + static_cast(!float_equal(miopen_beta, 0.0))); + }); + }; }; - }; result.construction_params.push_back(kernel); return result; diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index f4cdb191cf..c03aec3f33 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -116,49 +116,43 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [data_type, - blens, - clens, - astrides, - bstrides, - cstrides, - bitmap, - work_per_wg, - num_wg_orig](const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(data_type, [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - kernel(params.ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - params.BTensor, - static_cast(blens[1]), - static_cast(blens[2]), - static_cast(bstrides[0]), - static_cast(bstrides[1]), - params.CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - bitmap, - work_per_wg, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig)); - }); + result.invoker_factory = + [data_type, 
blens, clens, astrides, bstrides, cstrides, bitmap, work_per_wg, num_wg_orig]( + const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + params.BTensor, + static_cast(blens[1]), + static_cast(blens[2]), + static_cast(bstrides[0]), + static_cast(bstrides[1]), + params.CTensor, + static_cast(clens[1]), + static_cast(clens[2]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + bitmap, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig)); + }); + }; }; - }; result.construction_params.push_back(kernel); return result; diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index 93b9b9847b..3c67a3411f 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -116,54 +116,48 @@ Op4dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [data_type, - blens, - clens, - astrides, - bstrides, - cstrides, - work_per_wg, - num_wg_orig, - bitmap](const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(data_type, [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - kernel(params.ATensor, - static_cast(astrides[0]), // a_nstride, - static_cast(astrides[1]), // a_cstride, - static_cast(astrides[2]), // a_hstride, - params.BTensor, - static_cast(blens[1]), // b_c, - static_cast(blens[2]), // b_h, - static_cast(blens[3]), // b_w, - static_cast(bstrides[0]), // b_nstride, - static_cast(bstrides[1]), // b_cstride, - static_cast(bstrides[2]), // b_hstride, - params.CTensor, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_h, - static_cast(clens[3]), // c_w, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - bitmap, - work_per_wg, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig)); - }); + result.invoker_factory = + [data_type, blens, clens, astrides, bstrides, cstrides, work_per_wg, num_wg_orig, bitmap]( + const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = 
as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + static_cast(astrides[0]), // a_nstride, + static_cast(astrides[1]), // a_cstride, + static_cast(astrides[2]), // a_hstride, + params.BTensor, + static_cast(blens[1]), // b_c, + static_cast(blens[2]), // b_h, + static_cast(blens[3]), // b_w, + static_cast(bstrides[0]), // b_nstride, + static_cast(bstrides[1]), // b_cstride, + static_cast(bstrides[2]), // b_hstride, + params.CTensor, + static_cast(clens[1]), // c_c, + static_cast(clens[2]), // c_h, + static_cast(clens[3]), // c_w, + static_cast(cstrides[0]), // c_nstride, + static_cast(cstrides[1]), // c_cstride, + static_cast(cstrides[2]), // c_hstride, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + bitmap, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig)); + }); + }; }; - }; result.construction_params.push_back(kernel); return result; diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index 16e5d19d70..9df036df8c 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -146,73 +146,72 @@ ConvSolution OpTensorFwdBias::GetSolution([[maybe_unused]] const ExecutionContex kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = - [data_type, - blens, - clens, - astrides, - bstrides, - cstrides, - work_per_wg, - num_wg_orig, - incr_wg, - packed_tensor](const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(data_type, [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - if(packed_tensor) - { // OpTensorFwdBias - kernel(params.ATensor, - params.BTensor, - static_cast(blens[1]), - params.CTensor, - static_cast(clens[0]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - work_per_wg, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig), - static_cast(incr_wg)); - } - else - { // OpTensorFwdBiasGeneric - kernel(params.ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - params.BTensor, - static_cast(blens[1]), - static_cast(bstrides[1]), - params.CTensor, - static_cast(clens[0]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - static_cast(cstrides[2]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - work_per_wg, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig), - static_cast(incr_wg)); - } - }); - }; + result.invoker_factory = [data_type, + blens, + clens, + astrides, + bstrides, + cstrides, + work_per_wg, + num_wg_orig, + incr_wg, + packed_tensor](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto 
miopen_beta = as_float(*(static_cast(params.beta))); + + if(packed_tensor) + { // OpTensorFwdBias + kernel(params.ATensor, + params.BTensor, + static_cast(blens[1]), + params.CTensor, + static_cast(clens[0]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + work_per_wg, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + static_cast(incr_wg)); + } + else + { // OpTensorFwdBiasGeneric + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(astrides[2]), + params.BTensor, + static_cast(blens[1]), + static_cast(bstrides[1]), + params.CTensor, + static_cast(clens[0]), + static_cast(clens[3]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + static_cast(cstrides[2]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + static_cast(incr_wg)); + } + }); }; + }; result.construction_params.push_back(kernel); return result; diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index 9792b21093..d930da0da6 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -161,75 +161,74 @@ OpTensorLeadingOnes::GetSolution([[maybe_unused]] const ExecutionContext& contex kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = - [data_type, - clens, - astrides, - bstrides, - cstrides, - work_per_wg, - num_wg_orig, - bitmap, - packed_tensor](const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(data_type, [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - if(packed_tensor) - { // OpTensorLeadingOnes - kernel(params.ATensor, - params.BTensor, - params.CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - work_per_wg, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig), - bitmap); - } - else - { // OpTensorLeadingOnesGeneric - kernel(params.ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - params.BTensor, - static_cast(bstrides[0]), - static_cast(bstrides[1]), - static_cast(bstrides[2]), - params.CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - static_cast(cstrides[2]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - work_per_wg, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig), - bitmap); - } - }); - }; + result.invoker_factory = [data_type, + clens, + astrides, + bstrides, + cstrides, + work_per_wg, + num_wg_orig, + bitmap, + packed_tensor](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = 
handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + if(packed_tensor) + { // OpTensorLeadingOnes + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(clens[1]), + static_cast(clens[2]), + static_cast(clens[3]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + work_per_wg, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + bitmap); + } + else + { // OpTensorLeadingOnesGeneric + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(astrides[2]), + params.BTensor, + static_cast(bstrides[0]), + static_cast(bstrides[1]), + static_cast(bstrides[2]), + params.CTensor, + static_cast(clens[1]), + static_cast(clens[2]), + static_cast(clens[3]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + static_cast(cstrides[2]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + bitmap); + } + }); }; + }; result.construction_params.push_back(kernel); return result; From cb6fd6e956ca32bbd14331bff72b95a074dab626 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Mon, 18 Nov 2024 10:18:27 +0200 Subject: [PATCH 16/25] change for new Op3dTensorGeneric kernel usage --- src/solver/tensorOp/Op1dTensorGeneric.cpp | 6 ++-- src/solver/tensorOp/Op2dTensorGeneric.cpp | 2 +- src/solver/tensorOp/Op3dTensorGeneric.cpp | 42 +++++++++++------------ 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index c15b02ced7..896d75d50c 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -129,9 +129,9 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel(params.ATensor, params.BTensor, params.CTensor, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), static_cast(a_nstrides), static_cast(b_n == 1 ? 
0 : b_nstrides), static_cast(c_nstrides), diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index 917eee17c2..41fca78068 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -101,7 +101,7 @@ Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true); build_params.Define("USE_2D_TENSOR_GENERIC"); diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index c03aec3f33..c2c7212646 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -103,12 +103,11 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, GetCommonParams(build_params, problem, false); build_params.Define("USE_3D_TENSOR_GENERIC"); - build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); auto kernel = KernelInfo{}; - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; kernel.kernel_name = "Op3dTensorGeneric"; using std::begin, std::end; @@ -117,8 +116,7 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); result.invoker_factory = - [data_type, blens, clens, astrides, bstrides, cstrides, bitmap, work_per_wg, num_wg_orig]( - const std::vector kernels) { + [data_type, blens, clens, astrides, bstrides, cstrides](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); @@ -129,27 +127,29 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, auto miopen_beta = as_float(*(static_cast(params.beta))); kernel(params.ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), params.BTensor, - static_cast(blens[1]), - static_cast(blens[2]), - static_cast(bstrides[0]), - static_cast(bstrides[1]), params.CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(blens[1] == 1 ? clens[1] : blens[1]), // b_c, + static_cast(blens[2] == 1 ? clens[2] : blens[2]), // b_h, + static_cast(clens[1]), // c_c, + static_cast(clens[2]), // c_h, + static_cast(astrides[0]), // a_nstride, + static_cast(astrides[1]), // a_cstride, + static_cast(astrides[2]), // a_hstride, + static_cast(blens[0] == 1 ? 0 : bstrides[0]), // b_nstride, + static_cast(blens[1] == 1 ? 0 : bstrides[1]), // b_cstride, + static_cast(blens[2] == 1 ? 
0 : bstrides[2]), // b_hstride, + static_cast(cstrides[0]), // c_nstride, + static_cast(cstrides[1]), // c_cstride, + static_cast(cstrides[2]), // c_hstride, miopen_alpha0, miopen_alpha1, miopen_beta, - bitmap, - work_per_wg, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig)); + static_cast(clens[0]), + !float_equal(miopen_beta, 0.0)); }); }; }; From 6c3d0c2d7e87bb98e5023b1d78ab686ad2eebaf4 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Mon, 18 Nov 2024 15:34:53 +0200 Subject: [PATCH 17/25] remove unused variable --- src/solver/tensorOp/Op3dTensorGeneric.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index c2c7212646..3bef77fa29 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -88,7 +88,6 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); - int num_wg_orig = num_wg; int max_num_wg = 4096; num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; From bd0bd61c78ecf7a351f20486c9ec4f217f85e7a0 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Mon, 18 Nov 2024 15:36:19 +0200 Subject: [PATCH 18/25] clang format --- src/solver/tensorOp/Op3dTensorGeneric.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index 3bef77fa29..2bafc6abaa 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -88,8 +88,8 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); - int max_num_wg = 4096; - num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; + int max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; size_t local_threads = 256; size_t global_threads = num_wg * local_threads; From 3f14d3aa03fefc471a7498ba1992757bd08cc0d2 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Tue, 19 Nov 2024 10:00:35 +0200 Subject: [PATCH 19/25] support for half data type for CL kernels --- src/kernels/MIOpenTensorKernels.cl | 17 +++++++ src/solver/tensorOp/Op1dTensorGeneric.cpp | 2 +- src/solver/tensorOp/Op2dTensorGeneric.cpp | 2 +- src/solver/tensorOp/Op2dTensorLite.cpp | 2 +- src/solver/tensorOp/Op2dTensorSquash.cpp | 2 +- src/solver/tensorOp/Op3dTensorGeneric.cpp | 2 +- src/solver/tensorOp/Op4dTensorGeneric.cpp | 2 +- src/solver/tensorOp/Op4dTensorLite.cpp | 2 +- src/solver/tensorOp/Op5dTensorGeneric.cpp | 2 +- src/solver/tensorOp/OpTensorFwdBias.cpp | 2 +- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 2 +- src/solver/tensorOp/tensor_op_helpers.hpp | 50 ++++++++++++++++++++- 12 files changed, 76 insertions(+), 11 deletions(-) diff --git a/src/kernels/MIOpenTensorKernels.cl b/src/kernels/MIOpenTensorKernels.cl index 842d3d4d6b..fce43b78f3 100644 --- a/src/kernels/MIOpenTensorKernels.cl +++ b/src/kernels/MIOpenTensorKernels.cl @@ -23,6 +23,23 @@ * SOFTWARE. 
* *******************************************************************************/ +#if MIOPEN_USE_FP16 == 1 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#define _FLOAT half +#ifndef HALF_MAX +#define MAX_VAL 65504 /* max value */ +#else +#define MAX_VAL HALF_MAX +#endif +#endif +#if MIOPEN_USE_FP32 == 1 +#define _FLOAT float +#ifndef FLT_MAX +#define MAX_VAL 3.402823466e+38F /* max value */ +#else +#define MAX_VAL FLT_MAX +#endif +#endif /* Only works for NCHW * bitmap tracks which dims are the same between 'a' and 'c'. diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index 896d75d50c..9abf49b912 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -97,7 +97,7 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true); + GetCommonParams(build_params, problem, false, true); build_params.Define("USE_1D_TENSOR_GENERIC"); diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index 41fca78068..03045d69ae 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -101,7 +101,7 @@ Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true); + GetCommonParams(build_params, problem, false, true); build_params.Define("USE_2D_TENSOR_GENERIC"); diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index 2b7b030a2f..e070354bfe 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -133,7 +133,7 @@ ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true, false); build_params.Define("USE_2D_TENSOR_LITE"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index d6ca7cfa3b..40876e9d3c 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -117,7 +117,7 @@ Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true, false); build_params.Define("USE_2D_TENSOR_SQUASH"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index 2bafc6abaa..c8a5f9dd1a 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -99,7 +99,7 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, false, false); build_params.Define("USE_3D_TENSOR_GENERIC"); diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index 3c67a3411f..c28a094741 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ 
b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -101,7 +101,7 @@ Op4dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true, false); build_params.Define("USE_4D_TENSOR_GENERIC"); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp index a53174507e..fa9d45f0e5 100644 --- a/src/solver/tensorOp/Op4dTensorLite.cpp +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -112,7 +112,7 @@ ConvSolution Op4dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true, false); build_params.Define("USE_4D_TENSOR_LITE"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp index 35ef705f5b..bcf6d66773 100644 --- a/src/solver/tensorOp/Op5dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp @@ -104,7 +104,7 @@ Op5dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true, false); build_params.Define("USE_5D_TENSOR_GENERIC"); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index 9df036df8c..6d8d8139f1 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -121,7 +121,7 @@ ConvSolution OpTensorFwdBias::GetSolution([[maybe_unused]] const ExecutionContex KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true, false); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index d930da0da6..37daada086 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -137,7 +137,7 @@ OpTensorLeadingOnes::GetSolution([[maybe_unused]] const ExecutionContext& contex KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true, false); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); auto kernel = KernelInfo{}; diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp index 46ce39e4a0..d902914368 100644 --- a/src/solver/tensorOp/tensor_op_helpers.hpp +++ b/src/solver/tensorOp/tensor_op_helpers.hpp @@ -39,9 +39,57 @@ namespace tensorOp { inline void GetCommonParams(KernelBuildParameters& build_params, const miopen::tensorOp::ProblemDescription& problem, + bool isCLKernel, bool is64bSupported) { - build_params.Define("MIOPEN_TYPE", miopen::GetDataType(problem.GetBTensorDesc().GetType())); + miopenDataType_t data_type = problem.GetBTensorDesc().GetType(); + + if(isCLKernel) + { // values for MIOPEN_USE_ macros + int use_fp16 = 0; + int use_fp16x4 = 0; + int use_fp16x8 = 0; + int use_fp32 = 0; + int use_int8 = 0; + int use_int32 = 0; + int use_bfp16 = 
0;
+ int use_fp64 = 0;
+ int use_fp8 = 0;
+ int use_bfp8 = 0;
+ const int use_rne_bfloat16 = MIOPEN_USE_RNE_BFLOAT16;
+
+ switch(data_type)
+ {
+ case miopenHalf: use_fp16 = 1; break;
+ case miopenFloat: use_fp32 = 1; break;
+ case miopenInt8: use_int8 = 1; break;
+ case miopenBFloat16: use_bfp16 = 1; break;
+ case miopenInt32: use_int32 = 1; break;
+ case miopenDouble: use_fp64 = 1; break;
+ case miopenFloat8: use_fp8 = 1; break;
+ case miopenBFloat8: use_bfp8 = 1; break;
+ default: MIOPEN_THROW("Unsupported data type."); break;
+ }
+
+ build_params.Define("MIOPEN_USE_FP16", use_fp16);
+ build_params.Define("MIOPEN_USE_FP16x4", use_fp16x4);
+ build_params.Define("MIOPEN_USE_FP16x8", use_fp16x8);
+ build_params.Define("MIOPEN_USE_FP32", use_fp32);
+ build_params.Define("MIOPEN_USE_INT8", use_int8);
+ build_params.Define("MIOPEN_USE_BFP16", use_bfp16);
+ build_params.Define("MIOPEN_USE_INT32", use_int32);
+ build_params.Define("MIOPEN_USE_RNE_BFLOAT16", use_rne_bfloat16);
+ build_params.Define("MIOPEN_FP8_IEEE_EXPONENT_BIAS", MIOPEN_FP8_IEEE_EXPONENT_BIAS);
+ build_params.Define("MIOPEN_FP8_CLIPPING", MIOPEN_FP8_CLIPPING);
+ if(use_fp64 != 0)
+ build_params.Define("MIOPEN_USE_FP64", use_fp64);
+ if(use_fp8 != 0)
+ build_params.Define("MIOPEN_USE_FP8", use_fp8);
+ if(use_bfp8 != 0)
+ build_params.Define("MIOPEN_USE_BFP8", use_bfp8);
+ }
+
+ build_params.Define("MIOPEN_TYPE", miopen::GetDataType(data_type));

 switch(problem.GetTensorOp())
 {

From 042129eb3270929d2d0c402119510c47081918ae Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Tue, 19 Nov 2024 15:12:24 +0200
Subject: [PATCH 20/25] additional changes to support the half data type

---
 src/kernels/MIOpenTensorKernels.cl          | 16 -------
 src/solver/tensorOp/Op1dTensorGeneric.cpp   |  2 +-
 src/solver/tensorOp/Op2dTensorGeneric.cpp   |  2 +-
 src/solver/tensorOp/Op2dTensorLite.cpp      |  2 +-
 src/solver/tensorOp/Op2dTensorSquash.cpp    |  2 +-
 src/solver/tensorOp/Op3dTensorGeneric.cpp   |  2 +-
 src/solver/tensorOp/Op4dTensorGeneric.cpp   |  2 +-
 src/solver/tensorOp/Op4dTensorLite.cpp      |  2 +-
 src/solver/tensorOp/Op5dTensorGeneric.cpp   |  2 +-
 src/solver/tensorOp/OpTensorFwdBias.cpp     |  2 +-
 src/solver/tensorOp/OpTensorLeadingOnes.cpp |  2 +-
 src/solver/tensorOp/tensor_op_helpers.hpp   | 46 ---------------------
 12 files changed, 10 insertions(+), 72 deletions(-)

diff --git a/src/kernels/MIOpenTensorKernels.cl b/src/kernels/MIOpenTensorKernels.cl
index fce43b78f3..8203dad1f0 100644
--- a/src/kernels/MIOpenTensorKernels.cl
+++ b/src/kernels/MIOpenTensorKernels.cl
@@ -23,23 +23,7 @@
  * SOFTWARE.
  *
  *******************************************************************************/
-#if MIOPEN_USE_FP16 == 1
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#define _FLOAT half
-#ifndef HALF_MAX
-#define MAX_VAL 65504 /* max value */
-#else
-#define MAX_VAL HALF_MAX
-#endif
-#endif
-#if MIOPEN_USE_FP32 == 1
-#define _FLOAT float
-#ifndef FLT_MAX
-#define MAX_VAL 3.402823466e+38F /* max value */
-#else
-#define MAX_VAL FLT_MAX
-#endif
-#endif
 
 /* Only works for NCHW
  * bitmap tracks which dims are the same between 'a' and 'c'.
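
Note on the hunk above (an illustrative sketch, not part of the patch): patch 20 deletes the per-type macro block from the kernel source and leaves the cl_khr_fp16 pragma enabled unconditionally, so type-selection macros of this kind reach MIOpenTensorKernels.cl only as host-side -D build defines (compare the " -DMIOPEN_TYPE=" strings and the GetDataTypeKernelParams call in the tensorocl.cpp code deleted below). A minimal OpenCL C sketch of how a kernel preamble can consume such defines; _FLOAT, MAX_VAL and the MIOPEN_USE_* names mirror the removed block, while ScaleTensorSketch is a hypothetical kernel, not MIOpen code:

/* Sketch only: element type chosen by host-side defines such as
 * -DMIOPEN_USE_FP16=1 or -DMIOPEN_USE_FP32=1 (assumed convention). */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#if MIOPEN_USE_FP16 == 1
#define _FLOAT half
#define MAX_VAL 65504 /* HALF_MAX */
#elif MIOPEN_USE_FP32 == 1
#define _FLOAT float
#define MAX_VAL 3.402823466e+38F /* FLT_MAX */
#else
#error "expected MIOPEN_USE_FP16 or MIOPEN_USE_FP32 to be defined"
#endif

__kernel void ScaleTensorSketch(__global const _FLOAT* a,
                                __global _FLOAT* c,
                                const _FLOAT alpha,
                                const uint n)
{
    /* Grid-stride loop: correct even when the launch has fewer
       work-items than elements. */
    for(uint i = get_global_id(0); i < n; i += get_global_size(0))
        c[i] = alpha * a[i];
}

With a preamble like this, selecting half versus float is purely a host-side compile decision, which is why the solvers above only need to pass the right defines through KernelBuildParameters.
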
diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index 9abf49b912..896d75d50c 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -97,7 +97,7 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false, true); + GetCommonParams(build_params, problem, true); build_params.Define("USE_1D_TENSOR_GENERIC"); diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index 03045d69ae..41fca78068 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -101,7 +101,7 @@ Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false, true); + GetCommonParams(build_params, problem, true); build_params.Define("USE_2D_TENSOR_GENERIC"); diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index e070354bfe..2b7b030a2f 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -133,7 +133,7 @@ ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true, false); + GetCommonParams(build_params, problem, false); build_params.Define("USE_2D_TENSOR_LITE"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index 40876e9d3c..d6ca7cfa3b 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -117,7 +117,7 @@ Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true, false); + GetCommonParams(build_params, problem, false); build_params.Define("USE_2D_TENSOR_SQUASH"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index c8a5f9dd1a..2bafc6abaa 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -99,7 +99,7 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false, false); + GetCommonParams(build_params, problem, false); build_params.Define("USE_3D_TENSOR_GENERIC"); diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index c28a094741..3c67a3411f 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -101,7 +101,7 @@ Op4dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true, false); + GetCommonParams(build_params, problem, false); build_params.Define("USE_4D_TENSOR_GENERIC"); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp index fa9d45f0e5..a53174507e 100644 --- 
a/src/solver/tensorOp/Op4dTensorLite.cpp +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -112,7 +112,7 @@ ConvSolution Op4dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true, false); + GetCommonParams(build_params, problem, false); build_params.Define("USE_4D_TENSOR_LITE"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp index bcf6d66773..35ef705f5b 100644 --- a/src/solver/tensorOp/Op5dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp @@ -104,7 +104,7 @@ Op5dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true, false); + GetCommonParams(build_params, problem, false); build_params.Define("USE_5D_TENSOR_GENERIC"); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index 6d8d8139f1..9df036df8c 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -121,7 +121,7 @@ ConvSolution OpTensorFwdBias::GetSolution([[maybe_unused]] const ExecutionContex KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true, false); + GetCommonParams(build_params, problem, false); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index 37daada086..d930da0da6 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -137,7 +137,7 @@ OpTensorLeadingOnes::GetSolution([[maybe_unused]] const ExecutionContext& contex KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true, false); + GetCommonParams(build_params, problem, false); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); auto kernel = KernelInfo{}; diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp index d902914368..26a9ac42d0 100644 --- a/src/solver/tensorOp/tensor_op_helpers.hpp +++ b/src/solver/tensorOp/tensor_op_helpers.hpp @@ -39,56 +39,10 @@ namespace tensorOp { inline void GetCommonParams(KernelBuildParameters& build_params, const miopen::tensorOp::ProblemDescription& problem, - bool isCLKernel, bool is64bSupported) { miopenDataType_t data_type = problem.GetBTensorDesc().GetType(); - if(isCLKernel) - { // values for MIOPEN_USE_ macros - int use_fp16 = 0; - int use_fp16x4 = 0; - int use_fp16x8 = 0; - int use_fp32 = 0; - int use_int8 = 0; - int use_int32 = 0; - int use_bfp16 = 0; - int use_fp64 = 0; - int use_fp8 = 0; - int use_bfp8 = 0; - const int use_rne_bfloat16 = MIOPEN_USE_RNE_BFLOAT16; - - switch(data_type) - { - case miopenHalf: use_fp16 = 1; break; - case miopenFloat: use_fp32 = 1; break; - case miopenInt8: use_int8 = 1; break; - case miopenBFloat16: use_bfp16 = 1; break; - case miopenInt32: use_int32 = 1; break; - case miopenDouble: use_fp64 = 1; break; - case miopenFloat8: use_fp8 = 1; break; - case miopenBFloat8: use_bfp8 = 1; break; - default: MIOPEN_THROW("Unsupported data type."); break; - } - - build_params.Define("MIOPEN_USE_FP16", use_fp16); - build_params.Define("MIOPEN_USE_FP16x4", use_fp16x4); 
- build_params.Define("MIOPEN_USE_FP16x8", use_fp16x8); - build_params.Define("MIOPEN_USE_FP32", use_fp32); - build_params.Define("MIOPEN_USE_INT8", use_int8); - build_params.Define("MIOPEN_USE_BFP16", use_bfp16); - build_params.Define("MIOPEN_USE_INT32", use_int32); - build_params.Define("MIOPEN_USE_RNE_BFLOAT16", use_rne_bfloat16); - build_params.Define("MIOPEN_FP8_IEEE_EXPONENT_BIAS", MIOPEN_FP8_IEEE_EXPONENT_BIAS); - build_params.Define("MIOPEN_FP8_CLIPPING", MIOPEN_FP8_CLIPPING); - if(use_fp64 != 0) - build_params.Define("MIOPEN_USE_FP64", use_fp64); - if(use_fp8 != 0) - build_params.Define("MIOPEN_USE_FP8", use_fp8); - if(use_bfp8 != 0) - build_params.Define("MIOPEN_USE_BFP8", use_bfp8); - } - build_params.Define("MIOPEN_TYPE", miopen::GetDataType(data_type)); switch(problem.GetTensorOp()) From 371d43c4781535ea71b75ee0f9981b8576f35754 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Wed, 20 Nov 2024 18:29:56 +0200 Subject: [PATCH 21/25] initial removal of tensorocl.cpp --- src/CMakeLists.txt | 1 - src/include/miopen/rnn/solvers.hpp | 23 +- src/include/miopen/tensor_ops.hpp | 18 +- src/ocl/tensorocl.cpp | 2617 ----------------- src/rnn/Solutions/Base/bw_data_modular.cpp | 4 +- src/rnn/Solutions/Base/bw_weights_modular.cpp | 4 +- src/rnn/Solutions/Base/fw_data_modular.cpp | 7 +- src/rnn/Solutions/bwd_multi_stream.cpp | 2 +- src/rnn/Solutions/bww_multi_stream.cpp | 2 +- src/rnn/Solutions/bww_s_steam.cpp | 2 +- src/tensor.cpp | 1178 +++++++- test/tensor_ops.cpp | 36 +- 12 files changed, 1202 insertions(+), 2692 deletions(-) delete mode 100644 src/ocl/tensorocl.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4f1096001a..0721efc4f3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -696,7 +696,6 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN ocl/lrn_ocl.cpp ocl/mloNorm.cpp ocl/pooling_ocl.cpp - ocl/tensorocl.cpp ocl/rnnocl.cpp ocl/utilocl.cpp ocl/ctcocl.cpp diff --git a/src/include/miopen/rnn/solvers.hpp b/src/include/miopen/rnn/solvers.hpp index 429bcee752..908c3def65 100644 --- a/src/include/miopen/rnn/solvers.hpp +++ b/src/include/miopen/rnn/solvers.hpp @@ -171,9 +171,9 @@ class RNNForwardDataModularAlgo : RNNModuleAlgoBase // base API void PrepareWriteBuffers(const Handle& handle, const runtimeArgsFwd& runtimeArgs) const; - void PropX(const Handle& handle, const runtimeArgsFwd& runtimeArgs) const; + void PropX(Handle& handle, const runtimeArgsFwd& runtimeArgs) const; - void AddBias(const Handle& handle, const runtimeArgsFwd& runtimeArgs) const; + void AddBias(Handle& handle, const runtimeArgsFwd& runtimeArgs) const; void PropHxCx(const Handle& handle, const runtimeArgsFwd& runtimeArgs, unsigned int layer, @@ -206,7 +206,7 @@ class RNNForwardDataModularAlgo : RNNModuleAlgoBase void PropY(const Handle& handle, const runtimeArgsFwd& runtimeArgs) const; // ext API - void PropX(const Handle& handle, + void PropX(Handle& handle, const runtimeArgsFwd& runtimeArgs, size_t gemm_batch_offset, size_t gemm_batch_size) const; @@ -340,7 +340,7 @@ class RNNBackwardDataModularAlgo : RNNModuleAlgoBase public: void PrepareWriteBuffers(const Handle& handle, Data_t dhx, Data_t dcx, Data_t workSpace) const; - void PropDhy(const Handle& handle, + void PropDhy(Handle& handle, ConstData_t dhy, Data_t workSpace, unsigned int layer, @@ -364,7 +364,7 @@ class RNNBackwardDataModularAlgo : RNNModuleAlgoBase const SequenceIterator& seq, SequenceDirection direction) const; - void PropDhxDcx(const Handle& handle, + void PropDhxDcx(Handle& handle, 
ConstData_t w, Data_t dhx, Data_t dcx, @@ -625,7 +625,7 @@ class RNNModularMultiStreamBWD struct runtimeArgsBwd { - const Handle* handle; + Handle* handle; ConstData_t dy; ConstData_t dhy; Data_t dhx; @@ -728,11 +728,8 @@ class RNNBackwardWeightsModularAlgo ConstData_t reserveSpace, size_t layer) const; - void BiasUpdate(const Handle& handle, - Data_t dw, - Data_t workSpace, - size_t layer, - size_t workSpaceSize) const; + void BiasUpdate( + Handle& handle, Data_t dw, Data_t workSpace, size_t layer, size_t workSpaceSize) const; void HiddenHStateWeights(const Handle& handle, Data_t dw, @@ -1027,7 +1024,7 @@ class RNNModularSingleStreamBWWeights // TODO static size_t GetWsSize() { return 0; }; - void Compute(const Handle& handle, + void Compute(Handle& handle, ConstData_t x, ConstData_t hx, Data_t dw, @@ -1076,7 +1073,7 @@ class RNNModularMultiStreamBWWeights ConstData_t reserveSpace; }; - void Compute(const Handle& handle, + void Compute(Handle& handle, ConstData_t x, ConstData_t hx, Data_t dw, diff --git a/src/include/miopen/tensor_ops.hpp b/src/include/miopen/tensor_ops.hpp index c19eb333f2..a344eb9dbc 100644 --- a/src/include/miopen/tensor_ops.hpp +++ b/src/include/miopen/tensor_ops.hpp @@ -173,7 +173,7 @@ MIOPEN_INTERNALS_EXPORT void SetTensor(const Handle& handle, const void* alpha, int offset = 0); -MIOPEN_INTERNALS_EXPORT void OpTensor(const Handle& handle, +MIOPEN_INTERNALS_EXPORT void OpTensor(Handle& handle, miopenTensorOp_t tensorOp, const void* alpha0, const TensorDescriptor& aTensorDesc, @@ -189,22 +189,6 @@ MIOPEN_INTERNALS_EXPORT void OpTensor(const Handle& handle, size_t Coffset = 0, bool nonStandardSquash = false); -MIOPEN_INTERNALS_EXPORT void OpTensor2(Handle& handle, - miopenTensorOp_t tensorOp, - const void* alpha0, - const TensorDescriptor& aTensorDesc, - ConstData_t ATensor, - const void* alpha1, - const TensorDescriptor& bTensorDesc, - ConstData_t BTensor, - const void* beta, - const TensorDescriptor& cTensorDesc, - Data_t CTensor, - size_t Aoffset = 0, - size_t Boffset = 0, - size_t Coffset = 0, - bool nonStandardSquash = false); - MIOPEN_INTERNALS_EXPORT void CopyTensor(const Handle& handle, const TensorDescriptor& srcDesc, ConstData_t src, diff --git a/src/ocl/tensorocl.cpp b/src/ocl/tensorocl.cpp deleted file mode 100644 index 985861c09f..0000000000 --- a/src/ocl/tensorocl.cpp +++ /dev/null @@ -1,2617 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2023 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MIO_TENSOROCL_DEBUG 0 - -namespace miopen { - -TensorDescriptor GetFlattenedTensorDescriptor(const TensorDescriptor& desc) -{ - // is packed - if(desc.IsPacked()) - return {desc.GetType(), {desc.GetElementSize()}, {static_cast(1)}}; - - // start flattening tensor - std::vector flat_lengths; - std::vector flat_strides; - - auto non1_length_strides = boost::combine(desc.GetLengths(), desc.GetStrides()) | - boost::adaptors::filtered(f_length_is_not_1_t()); - - auto i = non1_length_strides.begin(); - std::size_t flat_len = boost::get<0>(*i); - auto i_previous = i++; - - // the 0-th dimension full-length doesn't matter - for(; i != non1_length_strides.end(); ++i) - { - std::size_t len = boost::get<0>(*i); - std::size_t stride = boost::get<1>(*i); - std::size_t previous_stride = boost::get<1>(*i_previous); - std::size_t full_len = previous_stride / stride; - - if(len == full_len) - { - flat_len *= len; - } - else - { - flat_lengths.push_back(flat_len); - flat_strides.push_back(previous_stride); - flat_len = len; - } - i_previous = i; - } - flat_lengths.push_back(flat_len); - flat_strides.push_back(boost::get<1>(*i_previous)); - - return {desc.GetType(), flat_lengths, flat_strides}; -} - -// Free Tensor Functions -static void CreateBitmapAndGrid(unsigned int& bitmap, - const std::vector& a_lens, - const std::vector& c_lens, - int& num_wg, - int& work, - int d) -{ - for(int i = d; i >= 0; i--) - { - if(a_lens[i] != 1) - { - bitmap |= (1 << (a_lens.size() - (i + 1))); - num_wg *= a_lens[i]; - } - else - { - work *= c_lens[i]; - } - } -} - -static bool IsBitmapLeadingOnes(unsigned int bitmap, int n_size, int first_not_one) -{ - bool leading_ones = true; - - for(int i = first_not_one; i >= 0; i--) - { - bool is_one = (bitmap & (1 << (n_size - 1 - i))) != 0u; - leading_ones &= is_one; - } - return leading_ones; -} - -void OpTensor3d(const Handle& handle, - miopenTensorOp_t tensorOp, - const void* alpha0, - const TensorDescriptor& aTensorDesc, - ConstData_t ATensor, - const void* alpha1, - const TensorDescriptor& bTensorDesc, - ConstData_t BTensor, - const void* beta, - const TensorDescriptor& cTensorDesc, - Data_t CTensor, - const size_t Aoffset, - const size_t Boffset, - const size_t Coffset, - const bool nonStandardSquash) -{ - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); - - auto astrides = aTensorDesc.GetStrides(); - auto bstrides = bTensorDesc.GetStrides(); - auto cstrides = cTensorDesc.GetStrides(); - - auto bsize = blens.size(); - - // first_not_one is incorrect if btensor size equal to 1 - auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); - auto d = std::distance(blens.begin(), first_not_one.base()); - - // quick fix - int num_wg = first_not_one != blens.rend() - ? static_cast(*first_not_one == 0 ? 
1 : *first_not_one) - : 1; - int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); - - unsigned int bitmap = 0; - // update bitmap for first_not_one - bitmap |= (1 << (bsize - d)); - - // (d-2) is because distance starts from 1 and 0 - // also, we need to go past the "first_not_one" as that is already - // accounted for in the bitmap - CreateBitmapAndGrid(bitmap, blens, clens, num_wg, work_per_wg, static_cast(d - 2)); - -#if(MIO_TENSOROCL_DEBUG == 1) - printf("bitmap: %u\n", bitmap); - printf("work_per_wg: %d, num_wg: %d\n", work_per_wg, num_wg); -#endif - - int max_num_wg = 4096; - num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; - - size_t local_threads = 256; - - std::string network_config{}; - - network_config = std::to_string(bTensorDesc.GetType()) + "-" + - std::to_string(aTensorDesc.GetType()) + "-" + std::to_string(tensorOp) + "-"; - - // for naive tensor ops - size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1; - const std::string data_type = GetDataType(bTensorDesc.GetType()); - const std::string READ_TYPE = (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK); - - size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); - size_t grp_sz = (total_work + local_threads - 1) / local_threads; - - // opencl kernels are no longer supported, fallback to generic case - bool lite_applicable = grp_sz <= size_t(max_num_wg); - - bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 && - (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2]; - - bool is_squashed = nonStandardSquash && !is_lite && - (blens[0] == 1 && clens[0] == 1 && clens[1] == 1 && blens[2] == clens[2]); - - grp_sz = std::min(size_t(max_num_wg), grp_sz); - size_t glb_sz = local_threads * grp_sz; - - size_t local_threads2 = 64; - size_t total_work2 = clens[1]; - size_t grp_sz2 = (total_work2 + local_threads2 - 1) / local_threads2; - grp_sz2 = std::min(size_t(max_num_wg / grp_sz), grp_sz2); - size_t glb_sz2 = local_threads2 * grp_sz2; - - visit_float(bTensorDesc.GetType(), [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(alpha1))); - auto miopen_beta = as_float(*(static_cast(beta))); - - if(lite_applicable && is_lite) - { - - network_config += std::to_string(RD_BLCK) + "x" + std::to_string(local_threads) + "x" + - std::to_string(grp_sz) + std::to_string(local_threads2) + - std::to_string(grp_sz2); - - auto&& kernels = handle.GetKernels("Op2dTensorLite", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - - kernel(ATensor, - static_cast(astrides[1]), // a_cstride, - BTensor, - static_cast(bstrides[1]), // b_cstride, - CTensor, - static_cast(cstrides[1]), // c_cstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(total_work), - static_cast(total_work2), - static_cast(!float_equal(miopen_beta, 0.0)), - static_cast(blens[1] == 1)); - - return; - } - } - else if(is_squashed) - { - network_config += std::to_string(RD_BLCK) + "x" + std::to_string(local_threads) + "x" + - std::to_string(grp_sz); - - auto&& kernels = handle.GetKernels("Op2dTensorSquash", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - - kernel(ATensor, - BTensor, - static_cast(blens[1]), // b_c, - static_cast(bstrides[1]), // b_cstride, - CTensor, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - 
static_cast(Coffset), - static_cast(total_work), - static_cast(!float_equal(miopen_alpha0, 0.0)), - static_cast(!float_equal(miopen_alpha1, 0.0)), - static_cast(!float_equal(miopen_beta, 0.0))); - - return; - } - } - else - { - - network_config += std::to_string(max_num_wg) + "-" + std::to_string(local_threads) + - "x" + std::to_string(num_wg); - - auto&& kernels = handle.GetKernels("Op3dTensorGeneric", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - - kernel(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(blens[1] == 1 ? clens[1] : blens[1]), // b_c, - static_cast(blens[2] == 1 ? clens[2] : blens[2]), // b_h, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_h, - static_cast(astrides[0]), // a_nstride, - static_cast(astrides[1]), // a_cstride, - static_cast(astrides[2]), // a_hstride, - static_cast(blens[0] == 1 ? 0 : bstrides[0]), // b_nstride, - static_cast(blens[1] == 1 ? 0 : bstrides[1]), // b_cstride, - static_cast(blens[2] == 1 ? 0 : bstrides[2]), // b_hstride, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - - return; - } - } - - std::string parms = " -DMIOPEN_TYPE=" + GetDataType(bTensorDesc.GetType()); - - parms += GetDataTypeKernelParams(aTensorDesc.GetType()); - - parms += " -DMIOPEN_TENSOR_OP="; - switch(tensorOp) - { - case 0: parms += "miopenAdd"; break; - case 1: parms += "miopenMul"; break; - case 2: parms += "miopenMin"; break; - case 3: parms += "miopenMax"; break; - } - std::string program_name = "MIOpenTensorKernels.cl"; - - if(lite_applicable && is_lite) - { - parms += " -DUSE_2D_TENSOR_LITE"; - parms += " -DRD_BLCK=" + std::to_string(RD_BLCK) + " -DREAD_TYPE=" + READ_TYPE; - - const std::vector vld{local_threads, 1, 1}; - const std::vector vgd1{glb_sz, glb_sz2, 1}; - - handle.AddKernel( - "Op2dTensorLite", network_config, program_name, "Op2dTensorLite", vld, vgd1, parms)( - ATensor, - static_cast(astrides[1]), // a_cstride, - BTensor, - static_cast(bstrides[1]), // b_cstride, - CTensor, - static_cast(cstrides[1]), // c_cstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(total_work), - static_cast(total_work2), - static_cast(!float_equal(miopen_beta, 0.0)), - static_cast(blens[1] == 1)); - } - else if(is_squashed) - { - parms += " -DUSE_2D_TENSOR_SQUASH"; - parms += " -DRD_BLCK=" + std::to_string(RD_BLCK) + " -DREAD_TYPE=" + READ_TYPE; - - const std::vector vld{local_threads, 1, 1}; - const std::vector vgd1{glb_sz, 1, 1}; - - handle.AddKernel("Op2dTensorSquash", - network_config, - program_name, - "Op2dTensorSquash", - vld, - vgd1, - parms)(ATensor, - BTensor, - static_cast(blens[1]), // b_c, - static_cast(bstrides[1]), // b_cstride, - CTensor, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(total_work), - static_cast(!float_equal(miopen_alpha0, 0.0)), - static_cast(!float_equal(miopen_alpha1, 0.0)), - static_cast(!float_equal(miopen_beta, 0.0))); - } - else - { - // Special case for adding tensors in place - program_name = "MIOpenTensorKernelsHip.cpp"; - local_threads = 32; - num_wg = std::clamp( - (clens[0] * clens[1] * clens[2]) / local_threads, size_t(1), size_t(max_num_wg)); - num_wg = 
num_wg > max_num_wg ? max_num_wg : num_wg; - - size_t global_threads; - global_threads = num_wg * local_threads; - const std::vector vld{local_threads, 1, 1}; - const std::vector vgd{global_threads, 1, 1}; - - parms += " -DUSE_3D_TENSOR_GENERIC"; - - handle.AddKernel("Op3dTensorGeneric", - network_config, - program_name, - "Op3dTensorGeneric", - vld, - vgd, - parms)( - ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(blens[1] == 1 ? clens[1] : blens[1]), // b_c, - static_cast(blens[2] == 1 ? clens[2] : blens[2]), // b_h, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_h, - static_cast(astrides[0]), // a_nstride, - static_cast(astrides[1]), // a_cstride, - static_cast(astrides[2]), // a_hstride, - static_cast(blens[0] == 1 ? 0 : bstrides[0]), // b_nstride, - static_cast(blens[1] == 1 ? 0 : bstrides[1]), // b_cstride, - static_cast(blens[2] == 1 ? 0 : bstrides[2]), // b_hstride, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - }); -} - -void OpTensor4d(const Handle& handle, - miopenTensorOp_t tensorOp, - const void* alpha0, - const TensorDescriptor& aTensorDesc, - ConstData_t ATensor, - const void* alpha1, - const TensorDescriptor& bTensorDesc, - ConstData_t BTensor, - const void* beta, - const TensorDescriptor& cTensorDesc, - Data_t CTensor, - const size_t Aoffset, - const size_t Boffset, - const size_t Coffset) -{ - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); - auto dims = clens.size(); - - auto astrides = aTensorDesc.GetStrides(); - auto bstrides = bTensorDesc.GetStrides(); - auto bsize = blens.size(); - auto cstrides = cTensorDesc.GetStrides(); - - // first_not_one is incorrect if btensor size equal to 1 - auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); - auto d = std::distance(blens.begin(), first_not_one.base()); - - // quick fix - int num_wg = first_not_one != blens.rend() - ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) - : 1; - int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); - - unsigned int bitmap = 0; - // update bitmap for first_not_one - bitmap |= (1 << (bsize - d)); - - // (d-2) is because distance starts from 1 and 0 - // also, we need to go past the "first_not_one" as that is already - // accounted for in the bitmap - CreateBitmapAndGrid(bitmap, blens, clens, num_wg, work_per_wg, static_cast(d - 2)); - - // quick fix for btensor = <1, 1, 1, 1> - if(bTensorDesc.GetElementSize() == 1) - bitmap = 4; - -#if(MIO_TENSOROCL_DEBUG == 1) - printf("bitmap: %u\n", bitmap); - printf("work_per_wg: %d, num_wg: %d\n", work_per_wg, num_wg); -#endif - - // Forward Convolution Bias specialization - // for fwd-bias, bitmap looks like <0, 1, 0, 0> - // Is the no. of work-groups and the work for each wg balanced? - auto fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; - auto incr_wg = 0; - // This block gives off indexing for 5d tensors, skipping - if(fwd_conv_bias == 1 && dims < 5 && num_wg < 640 && work_per_wg > 256 && clens[0] > 0) - { // 640 workgroups of size 256 needed to completely fill the GPU - - work_per_wg /= clens[0]; // c_n; - num_wg *= clens[0]; // c_n; - incr_wg = 1; - } - - int num_wg_orig = num_wg; - int max_num_wg = 4096; - num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; - - size_t local_threads = 256; - - // Does the bitmap contain leading ones, i.e. 1,1,1,0 or 1,1,0,0 - // or 1,1,1,1 or 1,0,0,0 - bool leading_ones = IsBitmapLeadingOnes(bitmap, dims, static_cast(d - 2)); - if(leading_ones && work_per_wg < 64) - { - local_threads = 64; - } - - std::string program_name = "MIOpenTensorKernels.cl"; - - const std::vector vld{local_threads, 1, 1}; - - // Special case for adding tensors in place - size_t global_threads; - global_threads = - (static_cast(leading_ones) == 1 && (d - 1) == 3) ? num_wg : num_wg * local_threads; - global_threads = (global_threads < local_threads) ? local_threads : global_threads; - - const std::vector vgd{global_threads, 1, 1}; - - bool packed_tensor = true; - - // auto alens = aTensorDesc.GetLengths(); - packed_tensor &= aTensorDesc.IsPacked(); - packed_tensor &= bTensorDesc.IsPacked(); - packed_tensor &= cTensorDesc.IsPacked(); - - bool packed_equal_tensor = - packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); - -#if(MIO_TENSOROCL_DEBUG == 1) - printf("packed_tensor: %d\n", packed_tensor); - printf("equal_tensor: %d\n", bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); -#endif - - // for naive tensor ops - const std::string data_type = GetDataType(bTensorDesc.GetType()); - - size_t TENS_LEN = cTensorDesc.GetElementSize(); - size_t RD_BLCK = (TENS_LEN % 4 == 0) ? 4 : (TENS_LEN % 2 == 0) ? 2 : 1; - const std::string READ_TYPE = (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK); - - size_t total_work = std::max(TENS_LEN / RD_BLCK, size_t(1)); - size_t grp_sz = (total_work + local_threads - 1) / local_threads; - grp_sz = std::min(size_t(max_num_wg), grp_sz); - size_t glb_sz = local_threads * grp_sz; - - std::string network_config{}; - network_config += - std::to_string(bTensorDesc.GetType()) + "-" + std::to_string(aTensorDesc.GetType()) + "-" + - std::to_string(tensorOp) + "-" + std::to_string(max_num_wg) + "-" + - ((fwd_conv_bias == 0 && packed_equal_tensor) ? 
"" : std::to_string(global_threads)) + "-" + - std::to_string(local_threads); - - visit_float(bTensorDesc.GetType(), [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(alpha1))); - auto miopen_beta = as_float(*(static_cast(beta))); - - if(fwd_conv_bias != 0) - { - if(packed_tensor) - { - auto&& kernels = handle.GetKernels("OpTensorFwdBias", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - kernel(ATensor, - BTensor, - static_cast(blens[1]), - CTensor, - static_cast(clens[0]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - work_per_wg, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - static_cast(incr_wg)); - - return; - } - } - else - { - - auto&& kernels = handle.GetKernels("OpTensorFwdBiasGeneric", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - kernel(ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - BTensor, - static_cast(blens[1]), - static_cast(bstrides[1]), - CTensor, - static_cast(clens[0]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - static_cast(cstrides[2]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - static_cast(incr_wg)); - return; - } - } - } - // precede leading_ones for bitmap = 1,1,1,1 - else if(packed_equal_tensor) - { - network_config += "x" + std::to_string(grp_sz) + "x" + std::to_string(RD_BLCK); - auto&& kernels = handle.GetKernels("Op4dTensorLite", network_config); - if(!kernels.empty()) - { - auto kernel = kernels.front(); - kernel(ATensor, - BTensor, - CTensor, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(total_work), - static_cast(!float_equal(miopen_beta, 0.0))); - return; - } - } - else if(leading_ones) - { - if(packed_tensor) - { - - auto&& kernels = handle.GetKernels("OpTensorLeadingOnes", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - kernel(ATensor, - BTensor, - CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - work_per_wg, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - bitmap); - - return; - } - } - else - { - auto&& kernels = handle.GetKernels("OpTensorLeadingOnesGeneric", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - kernel(ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - BTensor, - static_cast(bstrides[0]), - static_cast(bstrides[1]), - static_cast(bstrides[2]), - CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - static_cast(cstrides[2]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - bitmap); - return; - } - } - } - else - { - auto&& kernels = handle.GetKernels("Op4dTensorGeneric", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - kernel(ATensor, - static_cast(astrides[0]), // 
a_nstride, - static_cast(astrides[1]), // a_cstride, - static_cast(astrides[2]), // a_hstride, - BTensor, - static_cast(blens[1]), // b_c, - static_cast(blens[2]), // b_h, - static_cast(blens[3]), // b_w, - static_cast(bstrides[0]), // b_nstride, - static_cast(bstrides[1]), // b_cstride, - static_cast(bstrides[2]), // b_hstride, - CTensor, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_h, - static_cast(clens[3]), // c_w, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - bitmap, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig)); - return; - } - } - - std::string parms = " -DMIOPEN_TYPE=" + GetDataType(bTensorDesc.GetType()) + - " -DMAX_NUM_WG=" + std::to_string(max_num_wg); - - parms += GetDataTypeKernelParams(aTensorDesc.GetType()); - - parms += " -DMIOPEN_TENSOR_OP="; - switch(tensorOp) - { - case 0: parms += "miopenAdd"; break; - case 1: parms += "miopenMul"; break; - case 2: parms += "miopenMin"; break; - case 3: parms += "miopenMax"; break; - } - - if(fwd_conv_bias != 0) - { - if(packed_tensor) - { - parms += " -DUSE_FWD_BIAS"; - - handle.AddKernel("OpTensorFwdBias", - network_config, - program_name, - "OpTensorFwdBias", - vld, - vgd, - parms)(ATensor, - BTensor, - static_cast(blens[1]), - CTensor, - static_cast(clens[0]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - work_per_wg, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - static_cast(incr_wg)); - } - else - { - parms += " -DUSE_FWD_BIAS_GENERIC"; - handle.AddKernel("OpTensorFwdBiasGeneric", - network_config, - program_name, - "OpTensorFwdBiasGeneric", - vld, - vgd, - parms)(ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - BTensor, - static_cast(blens[1]), - static_cast(bstrides[1]), - CTensor, - static_cast(clens[0]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - static_cast(cstrides[2]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - static_cast(incr_wg)); - } - } - // precede leading_ones for bitmap = 1,1,1,1 - else if(packed_equal_tensor) - { - parms += " -DUSE_4D_TENSOR_LITE"; - parms += " -DRD_BLCK=" + std::to_string(RD_BLCK) + " -DREAD_TYPE=" + READ_TYPE; - - const std::vector vgd1{glb_sz, 1, 1}; - - handle.AddKernel( - "Op4dTensorLite", network_config, program_name, "Op4dTensorLite", vld, vgd1, parms)( - ATensor, - BTensor, - CTensor, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(total_work), - static_cast(!float_equal(miopen_beta, 0.0))); - } - else if(leading_ones) - { - if(packed_tensor) - { - parms += " -DUSE_LEADING_ONES"; - handle.AddKernel("OpTensorLeadingOnes", - network_config, - program_name, - "OpTensorLeadingOnes", - vld, - vgd, - parms)(ATensor, - BTensor, - CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - work_per_wg, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - bitmap); - } - else - { - - parms += " 
-DUSE_LEADING_ONES_GENERIC"; - - handle.AddKernel("OpTensorLeadingOnesGeneric", - network_config, - program_name, - "OpTensorLeadingOnesGeneric", - vld, - vgd, - parms)(ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - BTensor, - static_cast(bstrides[0]), - static_cast(bstrides[1]), - static_cast(bstrides[2]), - CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - static_cast(cstrides[2]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - bitmap); - } - } - else - { - parms += " -DUSE_4D_TENSOR_GENERIC"; - - handle.AddKernel("Op4dTensorGeneric", - network_config, - program_name, - "Op4dTensorGeneric", - vld, - vgd, - parms)(ATensor, - static_cast(astrides[0]), // a_nstride, - static_cast(astrides[1]), // a_cstride, - static_cast(astrides[2]), // a_hstride, - BTensor, - static_cast(blens[1]), // b_c, - static_cast(blens[2]), // b_h, - static_cast(blens[3]), // b_w, - static_cast(bstrides[0]), // b_nstride, - static_cast(bstrides[1]), // b_cstride, - static_cast(bstrides[2]), // b_hstride, - CTensor, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_h, - static_cast(clens[3]), // c_w, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - bitmap, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig)); - } - }); -} - -void OpTensorOther(const Handle& handle, - miopenTensorOp_t tensorOp, - const void* alpha0, - const TensorDescriptor& aTensorDesc, - ConstData_t ATensor, - const void* alpha1, - const TensorDescriptor& bTensorDesc, - ConstData_t BTensor, - const void* beta, - const TensorDescriptor& cTensorDesc, - Data_t CTensor, - const size_t Aoffset, - const size_t Boffset, - const size_t Coffset) -{ - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); - - auto astrides = aTensorDesc.GetStrides(); - auto bstrides = bTensorDesc.GetStrides(); - auto bsize = blens.size(); - auto cstrides = cTensorDesc.GetStrides(); - - const bool case_1d = bsize == 1; - const bool case_2d = bsize == 2; - const bool case_5d = bsize == 5; - - const bool use_hip = case_1d || case_2d; - - // first_not_one is incorrect if btensor size equal to 1 - auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); - auto d = std::distance(blens.begin(), first_not_one.base()); - - // quick fix - int num_wg = first_not_one != blens.rend() - ? static_cast(*first_not_one == 0 ? 
1 : *first_not_one) - : 1; - int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); - - unsigned int bitmap = 0; - // update bitmap for first_not_one - bitmap |= (1 << (bsize - d)); - - // (d-2) is because distance starts from 1 and 0 - // also, we need to go past the "first_not_one" as that is already - // accounted for in the bitmap - CreateBitmapAndGrid(bitmap, blens, clens, num_wg, work_per_wg, static_cast(d - 2)); - -#if(MIO_TENSOROCL_DEBUG == 1) - printf("bitmap: %u\n", bitmap); - printf("work_per_wg: %d, num_wg: %d\n", work_per_wg, num_wg); -#endif - - int num_wg_orig = num_wg; - int max_num_wg = 4096; - - size_t local_threads = 256; - - if(case_2d) - local_threads = 32; - - if(case_1d) - num_wg = std::clamp(clens[0] / local_threads, size_t(1), size_t(max_num_wg)); - if(case_2d) - num_wg = std::clamp((clens[0] * clens[1]) / local_threads, size_t(1), size_t(max_num_wg)); - num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; - - const std::vector vld{local_threads, 1, 1}; - - // Special case for adding tensors in place - size_t global_threads; - global_threads = num_wg * local_threads; - - const std::vector vgd{global_threads, 1, 1}; - - std::string program_name = use_hip ? "MIOpenTensorKernelsHip.cpp" : "MIOpenTensorKernels.cl"; - - std::string network_config{}; - network_config += std::to_string(bTensorDesc.GetType()) + "-" + - std::to_string(aTensorDesc.GetType()) + "-" + std::to_string(tensorOp) + "-" + - std::to_string(global_threads) + "-" + std::to_string(local_threads); - - if(case_1d || case_2d) - { - if(aTensorDesc.AllDimsFitIntoInt()) - { - network_config += "-32bit"; - } - else - { - network_config += "-64bit"; - } - } - - visit_float(bTensorDesc.GetType(), [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(alpha1))); - auto miopen_beta = as_float(*(static_cast(beta))); - - if(case_5d) - { - auto&& kernels = handle.GetKernels("Op5dTensorGeneric", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - kernel(ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - static_cast(astrides[3]), - BTensor, - static_cast(blens[1]), // b_c, - static_cast(blens[2]), // b_d, - static_cast(blens[3]), // b_h, - static_cast(blens[4]), // b_w, - static_cast(bstrides[0]), // b_nstride, - static_cast(bstrides[1]), // b_cstride, - static_cast(bstrides[2]), // b_dstride, - static_cast(bstrides[3]), // b_hstride, - CTensor, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_d, - static_cast(clens[3]), // c_h, - static_cast(clens[4]), // c_w, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_dstride, - static_cast(cstrides[3]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - bitmap, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig)); - return; - } - } - else if(case_2d) - { - auto&& kernels = handle.GetKernels("Op2dTensorGeneric", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - - if(aTensorDesc.AllDimsFitIntoInt()) - { - kernel(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(blens[1] == 1 ? clens[1] : blens[1]), - static_cast(clens[1]), - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(blens[1] == 1 ? 
0 : bstrides[1]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - else - { - kernel(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(blens[1] == 1 ? clens[1] : blens[1]), - static_cast(clens[1]), - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(blens[1] == 1 ? 0 : bstrides[1]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - - return; - } - } - else if(case_1d) - { - auto&& kernels = handle.GetKernels("Op1dTensorGeneric", network_config); - - if(!kernels.empty()) - { - - auto kernel = kernels.front(); - - if(aTensorDesc.AllDimsFitIntoInt()) - { - kernel(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(astrides[0]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(cstrides[0]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - else - { - kernel(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(astrides[0]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(cstrides[0]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - - return; - } - } - - std::string parms = " -DMIOPEN_TYPE=" + GetDataType(bTensorDesc.GetType()) + - " -DMAX_NUM_WG=" + std::to_string(max_num_wg); - - parms += GetDataTypeKernelParams(aTensorDesc.GetType()); - - parms += " -DMIOPEN_TENSOR_OP="; - switch(tensorOp) - { - case 0: parms += "miopenAdd"; break; - case 1: parms += "miopenMul"; break; - case 2: parms += "miopenMin"; break; - case 3: parms += "miopenMax"; break; - } - - if(aTensorDesc.AllDimsFitIntoInt()) - { - parms += " -DDIM_TYPE=uint32_t"; - } - else - { - parms += " -DDIM_TYPE=uint64_t"; - } - - if(case_5d) - { - parms += " -DUSE_5D_TENSOR_GENERIC"; - - handle.AddKernel("Op5dTensorGeneric", - network_config, - program_name, - "Op5dTensorGeneric", - vld, - vgd, - parms)(ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - static_cast(astrides[3]), - BTensor, - static_cast(blens[1]), // b_c, - static_cast(blens[2]), // b_d, - static_cast(blens[3]), // b_h, - static_cast(blens[4]), // b_w, - static_cast(bstrides[0]), // b_nstride, - static_cast(bstrides[1]), // b_cstride, - static_cast(bstrides[2]), // b_dstride, - static_cast(bstrides[3]), // b_hstride, - CTensor, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_d, - static_cast(clens[3]), // c_h, - static_cast(clens[4]), // c_w, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_dstride, - static_cast(cstrides[3]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - bitmap, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig)); - } - else if(case_2d) - { - parms += " -DUSE_2D_TENSOR_GENERIC"; - - if(aTensorDesc.AllDimsFitIntoInt()) - { - handle.AddKernel("Op2dTensorGeneric", - network_config, - program_name, - "Op2dTensorGeneric", - vld, - vgd, - parms)(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - 
static_cast(Boffset), - static_cast(Coffset), - static_cast(blens[1] == 1 ? clens[1] : blens[1]), - static_cast(clens[1]), - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(blens[1] == 1 ? 0 : bstrides[1]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - else - { - handle.AddKernel("Op2dTensorGeneric", - network_config, - program_name, - "Op2dTensorGeneric", - vld, - vgd, - parms)(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(blens[1] == 1 ? clens[1] : blens[1]), - static_cast(clens[1]), - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(blens[1] == 1 ? 0 : bstrides[1]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - } - else if(case_1d) - { - parms += " -DUSE_1D_TENSOR_GENERIC"; - - if(aTensorDesc.AllDimsFitIntoInt()) - { - handle.AddKernel("Op1dTensorGeneric", - network_config, - program_name, - "Op1dTensorGeneric", - vld, - vgd, - parms)(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(astrides[0]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(cstrides[0]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - else - { - handle.AddKernel("Op1dTensorGeneric", - network_config, - program_name, - "Op1dTensorGeneric", - vld, - vgd, - parms)(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(astrides[0]), - static_cast(blens[0] == 1 ? 
0 : bstrides[0]),
-                              static_cast(cstrides[0]),
-                              miopen_alpha0,
-                              miopen_alpha1,
-                              miopen_beta,
-                              static_cast(clens[0]),
-                              !float_equal(miopen_beta, 0.0));
-        }
-    }
-    });
-}
-
-void OpTensor(const Handle& handle,
-              miopenTensorOp_t tensorOp,
-              const void* alpha0,
-              const TensorDescriptor& aTensorDesc,
-              ConstData_t ATensor,
-              const void* alpha1,
-              const TensorDescriptor& bTensorDesc,
-              ConstData_t BTensor,
-              const void* beta,
-              const TensorDescriptor& cTensorDesc,
-              Data_t CTensor,
-              const size_t Aoffset,
-              const size_t Boffset,
-              const size_t Coffset,
-              bool nonStandardSquash)
-{
-    if(ATensor == nullptr || BTensor == nullptr || CTensor == nullptr)
-    {
-        MIOPEN_THROW(miopenStatusBadParm);
-    }
-
-    // if(aTensorDesc != cTensorDesc)
-    if(aTensorDesc.GetElementSize() != cTensorDesc.GetElementSize())
-    {
-        MIOPEN_THROW("A and C Tensors do not match");
-    }
-
-    if(bTensorDesc.GetType() != cTensorDesc.GetType())
-    {
-        MIOPEN_THROW("Datatypes for B and C tensors do not match !");
-    }
-
-    auto blens = bTensorDesc.GetLengths();
-#if(MIO_TENSOROCL_DEBUG == 1)
-    printf("blen:[");
-    for(auto len : blens)
-    {
-        printf(" %lu", len);
-    }
-    printf("]\n");
-#endif
-    auto clens = cTensorDesc.GetLengths();
-
-    if(clens.size() > 5)
-    {
-        MIOPEN_THROW("Tensor dimension larger than 5: " + std::to_string(clens.size()));
-    }
-
-    if(blens.size() != clens.size())
-    {
-        MIOPEN_THROW("Number of dims in B and C Tensors do not match: " +
-                     std::to_string(blens.size()) + ", " + std::to_string(clens.size()));
-    }
-
-    if(!nonStandardSquash)
-    {
-        for(std::size_t i = 0; i < clens.size(); i++)
-        {
-            if(blens[i] != 1 && blens[i] != clens[i])
-            {
-                MIOPEN_THROW("BTensor dim != 1 && BTensor dim != CTensor dim: " +
-                             std::to_string(i));
-            }
-        }
-    }
-    else
-    {
-        // non standard behavior because blens[1] can be not equalt to clens[1]
-        if(!(clens.size() == 3 && blens[0] == 1 && clens[0] == 1 && blens[2] == clens[2]))
-        {
-            MIOPEN_THROW("Non standard squashed operation supported only for 3d tensors and for "
-                         "the specific configuration");
-        }
-    }
-
-    auto bsize = blens.size();
-    if(bsize == 3)
-    {
-        OpTensor3d(handle,
-                   tensorOp,
-                   alpha0,
-                   aTensorDesc,
-                   ATensor,
-                   alpha1,
-                   bTensorDesc,
-                   BTensor,
-                   beta,
-                   cTensorDesc,
-                   CTensor,
-                   Aoffset,
-                   Boffset,
-                   Coffset,
-                   nonStandardSquash);
-    }
-    else if(bsize == 4)
-    {
-        OpTensor4d(handle,
-                   tensorOp,
-                   alpha0,
-                   aTensorDesc,
-                   ATensor,
-                   alpha1,
-                   bTensorDesc,
-                   BTensor,
-                   beta,
-                   cTensorDesc,
-                   CTensor,
-                   Aoffset,
-                   Boffset,
-                   Coffset);
-    }
-    else
-    {
-        OpTensorOther(handle,
-                      tensorOp,
-                      alpha0,
-                      aTensorDesc,
-                      ATensor,
-                      alpha1,
-                      bTensorDesc,
-                      BTensor,
-                      beta,
-                      cTensorDesc,
-                      CTensor,
-                      Aoffset,
-                      Boffset,
-                      Coffset);
-    }
-}
-
-struct two_exp_ceiling_t
-{
-    std::size_t operator()(std::size_t n) const
-    {
-        assert(n > 0);
-
-        std::size_t i = 1;
-
-        n--;
-        while(n != 0)
-        {
-            i *= 2;
-            n /= 2;
-        }
-
-        return i;
-    }
-};
-
-static std::vector<std::size_t> get_worker_sizes(const std::vector<std::size_t>& data_sizes)
-{
-    const std::size_t dim = data_sizes.size();
-
-    std::vector<std::size_t> worker_sizes(dim);
-
-    std::transform(data_sizes.begin(), data_sizes.end(), worker_sizes.begin(), two_exp_ceiling_t{});
-
-    std::size_t wgd = std::accumulate(
-        worker_sizes.begin(), worker_sizes.end(), std::size_t{1}, std::multiplies<std::size_t>());
-
-    if(wgd > 65536)
-    {
-        std::size_t n = wgd / 65536;
-
-        int i = 0;
-        while(n > 1 && i < dim)
-        {
-            std::size_t size_old = worker_sizes[i];
-            worker_sizes[i]      = (size_old - 1) / n + 1;
-            n /= size_old / worker_sizes[i];
-            ++i;
-        }
-    }
-
-    return worker_sizes;
-}
-
-void SetTensor(const Handle&
handle, - const TensorDescriptor& yDesc, - Data_t y, - const void* alpha, - const int offset) -{ - if(y == nullptr || alpha == nullptr) - { - MIOPEN_THROW(miopenStatusBadParm); - } - - const TensorDescriptor yDesc_flat = GetFlattenedTensorDescriptor(yDesc); - -#ifndef NDEBUG - if(yDesc.GetNumDims() != yDesc_flat.GetNumDims()) - { - MIOPEN_LOG_I2("real descriptor: " << yDesc); - MIOPEN_LOG_I2("flat descriptor: " << yDesc_flat); - } -#endif - - const std::size_t yDim_flat = yDesc_flat.GetNumDims(); - - assert(yDim_flat > 0 && yDim_flat <= 5); - - std::string kernel_name = "SubTensorOpWithScalar" + std::to_string(yDim_flat) + "d"; - - const miopenDataType_t dataType = yDesc_flat.GetType(); - - std::string network_config = "set " + std::to_string(dataType); - for(auto& len : yDesc_flat.GetLengths()) - { - network_config += " " + std::to_string(len); - } - - auto&& kernels = handle.GetKernels(kernel_name, network_config); - - KernelInvoke kernel; - - if(!kernels.empty()) - { - kernel = kernels.front(); - } - else - { - std::string program_name = "MIOpenSubTensorOpWithScalarKernel.cl"; - - std::vector worker_sizes = get_worker_sizes(yDesc_flat.GetLengths()); - - std::size_t wgd = std::accumulate(worker_sizes.begin(), - worker_sizes.end(), - std::size_t{1}, - std::multiplies()); - - std::size_t wld = 256 < wgd ? 256 : wgd; - std::stringstream ss; - ss << "-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET" - << GetDataTypeKernelParams(dataType); - for(int i = 0; i < yDim_flat; ++i) - { - ss << " -DWORK_LENGTH_" << std::to_string(i) << "=" << std::to_string(worker_sizes[i]); - } - - kernel = handle.AddKernel(kernel_name, - network_config, - program_name, - kernel_name, - {wld, 1, 1}, - {wgd, 1, 1}, - ss.str()); - } - - switch(yDim_flat) - { - case 1: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetLengths()[0])); - }); - - break; - } - case 2: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1])); - }); - - break; - } - case 3: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2])); - }); - - break; - } - case 4: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetStrides()[3]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2]), - static_cast(yDesc_flat.GetLengths()[3])); - }); - - break; - } - case 5: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetStrides()[3]), - static_cast(yDesc_flat.GetStrides()[4]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2]), 
- static_cast(yDesc_flat.GetLengths()[3]), - static_cast(yDesc_flat.GetLengths()[4])); - }); - - break; - } - default: assert(false); - } -} - -void ScaleTensor(const Handle& handle, - const TensorDescriptor& yDesc, - Data_t y, - const void* alpha, - const int offset) -{ - if(y == nullptr || alpha == nullptr) - { - MIOPEN_THROW(miopenStatusBadParm); - } - - const TensorDescriptor yDesc_flat = GetFlattenedTensorDescriptor(yDesc); - -#ifndef NDEBUG - if(yDesc.GetNumDims() != yDesc_flat.GetNumDims()) - { - MIOPEN_LOG_I2("real descriptor: " << yDesc); - MIOPEN_LOG_I2("flat descriptor: " << yDesc_flat); - } -#endif - - const std::size_t yDim_flat = yDesc_flat.GetNumDims(); - - assert(yDim_flat > 0 && yDim_flat <= 5); - - const miopenDataType_t dataType = yDesc_flat.GetType(); - - if(!(dataType == miopenHalf // - || dataType == miopenFloat // - || dataType == miopenInt32 // - || dataType == miopenDouble)) - { - MIOPEN_THROW(miopenStatusBadParm, "ScaleTensor: unsupported data type."); - } - - std::string kernel_name = "SubTensorOpWithScalar" + std::to_string(yDim_flat) + "d"; - - const std::vector& lens = yDesc_flat.GetLengths(); - - std::string network_config = "scale " + std::to_string(yDesc_flat.GetType()); - for(auto& len : lens) - { - network_config += " " + std::to_string(len); - } - - auto&& kernels = handle.GetKernels(kernel_name, network_config); - - KernelInvoke kernel; - - if(!kernels.empty()) - { - kernel = kernels.front(); - } - else - { - std::string program_name = "MIOpenSubTensorOpWithScalarKernel.cl"; - - std::vector worker_sizes = get_worker_sizes(lens); - - std::size_t wgd = std::accumulate(worker_sizes.begin(), - worker_sizes.end(), - std::size_t{1}, - std::multiplies()); - - std::size_t wld = 256 < wgd ? 256 : wgd; - - std::string parms = "-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_MULTIPLY" + - GetDataTypeKernelParams(dataType); - for(int i = 0; i < yDim_flat; ++i) - { - parms += " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); - } - - kernel = handle.AddKernel(kernel_name, - network_config, - program_name, - kernel_name, - {wld, 1, 1}, - {wgd, 1, 1}, - parms); - } - - switch(yDim_flat) - { - case 1: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetLengths()[0])); - }); - - break; - } - case 2: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1])); - }); - - break; - } - case 3: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2])); - }); - - break; - } - case 4: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetStrides()[3]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2]), - static_cast(yDesc_flat.GetLengths()[3])); - }); - - break; - } - case 5: { - 
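// Note: 5-d is the deepest case this switch needs. GetFlattenedTensorDescriptor
// collapses contiguous dimensions first and yDim_flat is asserted above to lie
// in [1, 5], so a fully packed tensor of any rank reaches here as the 1-d case.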
visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetStrides()[3]), - static_cast(yDesc_flat.GetStrides()[4]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2]), - static_cast(yDesc_flat.GetLengths()[3]), - static_cast(yDesc_flat.GetLengths()[4])); - }); - - break; - } - default: assert(false); - } -} - -void CopyTensor(const Handle& handle, - const TensorDescriptor& srcDesc, - ConstData_t src, - const TensorDescriptor& dstDesc, - Data_t dst, - int srcOffset, - int dstOffset, - bool forseAsync) -{ - if(src == nullptr || dst == nullptr) - { - MIOPEN_THROW(miopenStatusBadParm, "Null pointer for tensor."); - } - - if(srcDesc.GetType() != dstDesc.GetType()) - { - MIOPEN_THROW(miopenStatusBadParm, "Tensor types do not match."); - } - - if(srcDesc.GetLengths() != dstDesc.GetLengths()) - { - MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match."); - } - - auto flat_descriptors = GetConsistentFlattenedTensorDescriptors(srcDesc, dstDesc); - const TensorDescriptor& srcDesc_flat = std::get<0>(flat_descriptors); - const TensorDescriptor& dstDesc_flat = std::get<1>(flat_descriptors); - -#ifndef NDEBUG - if(srcDesc.GetNumDims() != srcDesc_flat.GetNumDims()) - { - MIOPEN_LOG_I2("src real descriptor: " << srcDesc); - MIOPEN_LOG_I2("src flat descriptor: " << srcDesc_flat); - MIOPEN_LOG_I2("dst real descriptor: " << dstDesc); - MIOPEN_LOG_I2("dst flat descriptor: " << dstDesc_flat); - } -#endif - - std::size_t srcDim_flat = srcDesc_flat.GetNumDims(); - - if(srcDim_flat < 1 || srcDim_flat > 5) - { - MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension sizes unsupported."); - } - - if(forseAsync || srcOffset > 0 || dstOffset > 0 || - (!(srcDesc_flat.IsPacked() && dstDesc_flat.IsPacked()))) - { - std::string kernel_name = "SubTensorOpWithSubTensor" + std::to_string(srcDim_flat) + "d"; - - const std::vector& lens = srcDesc_flat.GetLengths(); - - std::string network_config = "copy " + std::to_string(srcDesc_flat.GetType()); - for(auto& len : lens) - { - network_config += " " + std::to_string(len); - } - - auto&& kernels = handle.GetKernels(kernel_name, network_config); - - KernelInvoke kernel; - - if(!kernels.empty()) - { - kernel = kernels.front(); - } - else - { - std::string program_name = "MIOpenSubTensorOpWithSubTensorKernel.cl"; - - std::vector worker_sizes = get_worker_sizes(lens); - - std::size_t wgd = std::accumulate(worker_sizes.begin(), - worker_sizes.end(), - std::size_t{1}, - std::multiplies()); - - std::size_t wld = 256 < wgd ? 
256 : wgd; - - std::string parms = "-DSUBTENSOR_OP_WITH_SUBTENSOR=SUBTENSOR_OP_WITH_SUBTENSOR_COPY" + - GetDataTypeKernelParams(srcDesc_flat.GetType()); - for(std::size_t i = 0; i < srcDim_flat; ++i) - { - parms += - " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); - } - - kernel = handle.AddKernel(kernel_name, - network_config, - program_name, - kernel_name, - {wld, 1, 1}, - {wgd, 1, 1}, - parms); - } - - switch(srcDim_flat) - { - case 1: { - kernel(src, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetLengths()[0]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0])); - - break; - } - case 2: { - kernel(src, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1])); - - break; - } - case 3: { - kernel(src, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetStrides()[2]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - static_cast(srcDesc_flat.GetLengths()[2]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1]), - static_cast(dstDesc_flat.GetStrides()[2])); - - break; - } - case 4: { - kernel(src, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetStrides()[2]), - static_cast(srcDesc_flat.GetStrides()[3]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - static_cast(srcDesc_flat.GetLengths()[2]), - static_cast(srcDesc_flat.GetLengths()[3]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1]), - static_cast(dstDesc_flat.GetStrides()[2]), - static_cast(dstDesc_flat.GetStrides()[3])); - - break; - } - case 5: { - kernel(src, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetStrides()[2]), - static_cast(srcDesc_flat.GetStrides()[3]), - static_cast(srcDesc_flat.GetStrides()[4]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - static_cast(srcDesc_flat.GetLengths()[2]), - static_cast(srcDesc_flat.GetLengths()[3]), - static_cast(srcDesc_flat.GetLengths()[4]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1]), - static_cast(dstDesc_flat.GetStrides()[2]), - static_cast(dstDesc_flat.GetStrides()[3]), - static_cast(dstDesc_flat.GetStrides()[4])); - - break; - } - default: assert(false); - } - } - else - { - handle.Copy(src, dst, srcDesc_flat.GetElementSize() * GetTypeSize(srcDesc_flat.GetType())); - } -} - -std::string GetCastTensorBuildOptionFromType(const std::string& buildOption, miopenDataType_t type) -{ - std::string option(buildOption); - switch(type) - { - case miopenInt8: return option += "0"; - case miopenInt32: return option += "1"; - case miopenHalf: return option += "2"; - case miopenFloat: return option += "3"; - case miopenBFloat16: return option += "4"; - case miopenFloat8: - MIOPEN_THROW(miopenStatusBadParm, "miopenFloat8 data type not supported in cast tensor."); - case miopenBFloat8: - MIOPEN_THROW(miopenStatusBadParm, 
"miopenBFloat8 data type not supported in cast tensor."); - case miopenDouble: - // TODO - MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported in cast tensor."); - case miopenInt64: - MIOPEN_THROW(miopenStatusBadParm, "miopenInt64 data type not supported in cast tensor."); - default: MIOPEN_THROW(miopenStatusBadParm, "Invalid data type in cast tensor desc."); - } -} - -void CastTensor(const Handle& handle, - const void* alpha, - const bool clamping, - const TensorDescriptor& srcDesc, - ConstData_t src, - const TensorDescriptor& dstDesc, - Data_t dst, - int srcOffset, - int dstOffset) -{ - if(src == nullptr || dst == nullptr) - { - MIOPEN_THROW(miopenStatusBadParm, "Null pointer for tensor."); - } - - if(srcDesc.GetLengths() != dstDesc.GetLengths()) - { - MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match."); - } - - auto flat_descriptors = GetConsistentFlattenedTensorDescriptors(srcDesc, dstDesc); - const TensorDescriptor& srcDesc_flat = std::get<0>(flat_descriptors); - const TensorDescriptor& dstDesc_flat = std::get<1>(flat_descriptors); - -#ifndef NDEBUG - if(srcDesc.GetNumDims() != srcDesc_flat.GetNumDims()) - { - MIOPEN_LOG_I2("src real descriptor: " << srcDesc); - MIOPEN_LOG_I2("src flat descriptor: " << srcDesc_flat); - MIOPEN_LOG_I2("dst real descriptor: " << dstDesc); - MIOPEN_LOG_I2("dst flat descriptor: " << dstDesc_flat); - } -#endif - - std::size_t srcDim_flat = srcDesc_flat.GetNumDims(); - - if(srcDim_flat < 1 || srcDim_flat > 5) - { - MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension sizes unsupported."); - } - - if(srcDesc.GetType() == dstDesc.GetType() && srcOffset == 0 && dstOffset == 0 && - srcDesc_flat.IsPacked() && dstDesc_flat.IsPacked()) - { - handle.Copy(src, dst, srcDesc_flat.GetElementSize() * GetTypeSize(srcDesc_flat.GetType())); - } - else - { - std::string kernel_name = "SubTensorOpWithCastTensor" + std::to_string(srcDim_flat) + "d"; - - const std::vector& lens = srcDesc_flat.GetLengths(); - - std::string network_config = "cast " + std::to_string(dstDesc_flat.GetType()); - for(auto& len : lens) - { - network_config += " " + std::to_string(len); - } - - auto&& kernels = handle.GetKernels(kernel_name, network_config); - KernelInvoke kernel; - - auto miopen_alpha = *(static_cast(alpha)); - - if(!kernels.empty()) - { - kernel = kernels.front(); - } - else - { - std::string program_name = "MIOpenSubTensorOpWithCastTensorKernel.cl"; - - std::vector worker_sizes = get_worker_sizes(lens); - - std::size_t wgd = std::accumulate(worker_sizes.begin(), - worker_sizes.end(), - std::size_t{1}, - std::multiplies()); - - std::size_t wld = 256 < wgd ? 256 : wgd; - - std::string parms = - GetCastTensorBuildOptionFromType(" -DMIOPEN_SRC_TYPE=", srcDesc_flat.GetType()) + - GetCastTensorBuildOptionFromType(" -DMIOPEN_DST_TYPE=", dstDesc_flat.GetType()); - - for(std::size_t i = 0; i < srcDim_flat; ++i) - { - parms += - " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); - } - - if(dstDesc_flat.GetType() == miopenBFloat16) - { - parms += " -DMIOPEN_USE_RNE_BFLOAT16=1"; - } - - kernel = handle.AddKernel(kernel_name, - network_config, - program_name, - kernel_name, - {wld, 1, 1}, - {wgd, 1, 1}, - parms); - } - - const int clamping_arg = clamping ? 
1 : 0; - switch(srcDim_flat) - { - case 1: { - kernel(src, - miopen_alpha, - clamping_arg, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetLengths()[0]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0])); - - break; - } - case 2: { - kernel(src, - miopen_alpha, - clamping_arg, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1])); - - break; - } - case 3: { - kernel(src, - miopen_alpha, - clamping_arg, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetStrides()[2]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - static_cast(srcDesc_flat.GetLengths()[2]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1]), - static_cast(dstDesc_flat.GetStrides()[2])); - - break; - } - case 4: { - kernel(src, - miopen_alpha, - clamping_arg, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetStrides()[2]), - static_cast(srcDesc_flat.GetStrides()[3]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - static_cast(srcDesc_flat.GetLengths()[2]), - static_cast(srcDesc_flat.GetLengths()[3]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1]), - static_cast(dstDesc_flat.GetStrides()[2]), - static_cast(dstDesc_flat.GetStrides()[3])); - - break; - } - case 5: { - kernel(src, - miopen_alpha, - clamping_arg, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetStrides()[2]), - static_cast(srcDesc_flat.GetStrides()[3]), - static_cast(srcDesc_flat.GetStrides()[4]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - static_cast(srcDesc_flat.GetLengths()[2]), - static_cast(srcDesc_flat.GetLengths()[3]), - static_cast(srcDesc_flat.GetLengths()[4]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1]), - static_cast(dstDesc_flat.GetStrides()[2]), - static_cast(dstDesc_flat.GetStrides()[3]), - static_cast(dstDesc_flat.GetStrides()[4])); - - break; - } - default: assert(false); - } - } -} - -void TransformTensor(const Handle& handle, - const void* alpha, - const TensorDescriptor& xDesc, - ConstData_t x, - const void* beta, - const TensorDescriptor& yDesc, - Data_t y, - size_t Xoffset, - size_t Yoffset) -{ - if(x == nullptr || y == nullptr) - { - MIOPEN_THROW(miopenStatusBadParm); - } - - if(alpha == nullptr || beta == nullptr) - { - MIOPEN_THROW(miopenStatusBadParm); - } - - auto x_len = xDesc.GetLengths(); - auto y_len = yDesc.GetLengths(); - - if(x_len.size() != y_len.size()) - { - MIOPEN_THROW("Tensor dimension must be the same"); - } - - if(x_len[0] != y_len[0]) - { - MIOPEN_THROW("Tensor x and y batch sizes do not match"); - } - - const auto is_alpha_one = float_equal(*(static_cast(alpha)), 1); - const auto is_beta_zero = float_equal(*(static_cast(beta)), 0); - - if(xDesc.GetType() == miopenInt8 && yDesc.GetType() == miopenInt8 && x_len.size() >= 3) - { - if(x_len[1] <= y_len[1]) - { - 
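// Note: y's channel count may exceed x's only as padding up to the next
// multiple of 4 (presumably for the vectorized int8 layout); y is zero-filled
// first so the padded channels still read back as zeros after the per-batch
// copies below.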
if(x_len[1] <= (y_len[1] - 4) || y_len[1] % 4 != 0) - { - MIOPEN_THROW("Invalid y channel size"); - } - - int8_t zero = 0; - SetTensor(handle, yDesc, y, &zero); - } - else if(x_len[1] % 4 != 0) - { - MIOPEN_THROW("Invalid x channel size"); - } - - size_t batch_n = x_len[0]; - - x_len[0] = 1; - y_len[0] = 1; - - miopen::TensorDescriptor x_batch_desc, y_batch_desc; - x_batch_desc = miopen::TensorDescriptor(miopenInt8, x_len); - y_batch_desc = miopen::TensorDescriptor(miopenInt8, y_len); - - size_t x_batch_sz = x_batch_desc.GetElementSize(); - size_t y_batch_sz = y_batch_desc.GetElementSize(); - - for(size_t i = 0; i < batch_n; i++) - { - size_t x_offset = i * x_batch_sz; - size_t y_offset = i * y_batch_sz; - - if(is_alpha_one && is_beta_zero) - { - CopyTensor(handle, - ((x_len[1] <= y_len[1]) ? x_batch_desc : y_batch_desc), - x, - ((x_len[1] <= y_len[1]) ? x_batch_desc : y_batch_desc), - y, - x_offset, - y_offset); - } - else - { - MIOPEN_THROW(miopenStatusNotImplemented, - "y=alpha*x+beta*y is not supported for int8 yet"); - } - } - } - else - { - auto x_y_len = boost::combine(x_len, y_len); - bool same_spatial_len = std::all_of(x_y_len.begin(), x_y_len.end(), [](auto v) { - return boost::get<0>(v) == boost::get<1>(v); - }); - - if(!same_spatial_len) - { - MIOPEN_THROW("Tensor x and y spatial sizes do not match"); - } - - auto flat_descriptors = GetConsistentFlattenedTensorDescriptors(xDesc, yDesc); - const TensorDescriptor& xDesc_flat = std::get<0>(flat_descriptors); - const TensorDescriptor& yDesc_flat = std::get<1>(flat_descriptors); - - if(xDesc.GetNumDims() != xDesc_flat.GetNumDims()) - { - MIOPEN_LOG_I2("x real descriptor: " << xDesc); - MIOPEN_LOG_I2("x flat descriptor: " << xDesc_flat); - } - - if(yDesc.GetNumDims() != yDesc_flat.GetNumDims()) - { - MIOPEN_LOG_I2("y real descriptor: " << yDesc); - MIOPEN_LOG_I2("y flat descriptor: " << yDesc_flat); - } - - const std::size_t yDim_flat = yDesc_flat.GetNumDims(); - - assert(yDim_flat > 0 && yDim_flat <= 5); - - const miopenDataType_t dataTypex = xDesc_flat.GetType(); - const miopenDataType_t dataTypey = yDesc_flat.GetType(); - - if(!(dataTypex == miopenHalf // - || dataTypex == miopenFloat // - || dataTypex == miopenInt32 // - || dataTypex == miopenBFloat16 // - || dataTypex == miopenDouble)) - { - MIOPEN_THROW("Tensor x is a unsupported data type"); - } - - if(!(dataTypey == miopenHalf // - || dataTypey == miopenFloat // - || dataTypey == miopenInt32 // - || dataTypey == miopenBFloat16 // - || dataTypey == miopenDouble)) - { - MIOPEN_THROW("Tensor y is a unsupported data type"); - } - - if(dataTypex != dataTypey) - { - MIOPEN_THROW("Tensor x and y have different data types"); - } - - std::string kernel_name = "SubTensorOpWithTransform" + std::to_string(yDim_flat) + "d"; - - const std::vector& lens = yDesc_flat.GetLengths(); - - std::string network_config = "transform " + std::to_string(yDesc_flat.GetType()); - for(auto& len : lens) - { - network_config += "x" + std::to_string(len); - } - - if(is_beta_zero) - network_config += "xBETA_IS_ZERO"; - if(is_alpha_one) - network_config += "xALPHA_IS_ONE"; - - auto&& kernels = handle.GetKernels(kernel_name, network_config); - - KernelInvoke kernel; - - if(!kernels.empty()) - { - kernel = kernels.front(); - } - else - { - std::string program_name = "MIOpenSubTensorOpWithTransformKernel.cl"; - - std::vector worker_sizes = get_worker_sizes(lens); - - std::size_t wgd = std::accumulate(worker_sizes.begin(), - worker_sizes.end(), - std::size_t{1}, - std::multiplies()); - - std::size_t wld = 256 < 
wgd ? 256 : wgd; - - std::string parms = - GetDataTypeKernelParams(dataTypey) // - + " -DMIOPEN_BETA_IS_ZERO=" + std::to_string(static_cast(is_beta_zero)) // - + " -DMIOPEN_ALPHA_IS_ONE=" + std::to_string(static_cast(is_alpha_one)); - - for(int i = 0; i < yDim_flat; ++i) - { - parms += - " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); - } - - kernel = handle.AddKernel(kernel_name, - network_config, - program_name, - kernel_name, - {wld, 1, 1}, - {wgd, 1, 1}, - parms); - } - - switch(yDim_flat) - { - case 1: { - visit_float(dataTypey, [&](auto as_float) { - kernel(x, - *as_float(alpha), - y, - *as_float(beta), - static_cast(Xoffset), - static_cast(Yoffset), - static_cast(xDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetLengths()[0])); - }); - - break; - } - case 2: { - visit_float(dataTypey, [&](auto as_float) { - kernel(x, - *as_float(alpha), - y, - *as_float(beta), - static_cast(Xoffset), - static_cast(Yoffset), - static_cast(xDesc_flat.GetStrides()[0]), - static_cast(xDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1])); - }); - - break; - } - case 3: { - visit_float(dataTypey, [&](auto as_float) { - kernel(x, - *as_float(alpha), - y, - *as_float(beta), - static_cast(Xoffset), - static_cast(Yoffset), - static_cast(xDesc_flat.GetStrides()[0]), - static_cast(xDesc_flat.GetStrides()[1]), - static_cast(xDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2])); - }); - - break; - } - case 4: { - visit_float(dataTypey, [&](auto as_float) { - kernel(x, - *as_float(alpha), - y, - *as_float(beta), - static_cast(Xoffset), - static_cast(Yoffset), - static_cast(xDesc_flat.GetStrides()[0]), - static_cast(xDesc_flat.GetStrides()[1]), - static_cast(xDesc_flat.GetStrides()[2]), - static_cast(xDesc_flat.GetStrides()[3]), - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetStrides()[3]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2]), - static_cast(yDesc_flat.GetLengths()[3])); - }); - - break; - } - case 5: { - visit_float(dataTypey, [&](auto as_float) { - kernel(x, - *as_float(alpha), - y, - *as_float(beta), - static_cast(Xoffset), - static_cast(Yoffset), - static_cast(xDesc_flat.GetStrides()[0]), - static_cast(xDesc_flat.GetStrides()[1]), - static_cast(xDesc_flat.GetStrides()[2]), - static_cast(xDesc_flat.GetStrides()[3]), - static_cast(xDesc_flat.GetStrides()[4]), - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetStrides()[3]), - static_cast(yDesc_flat.GetStrides()[4]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2]), - static_cast(yDesc_flat.GetLengths()[3]), - static_cast(yDesc_flat.GetLengths()[4])); - }); - - break; - } - default: assert(false); - } - } -} - -} // namespace miopen diff --git a/src/rnn/Solutions/Base/bw_data_modular.cpp b/src/rnn/Solutions/Base/bw_data_modular.cpp 
index 04bbfd780e..0f840e98d2 100644
--- a/src/rnn/Solutions/Base/bw_data_modular.cpp
+++ b/src/rnn/Solutions/Base/bw_data_modular.cpp
@@ -62,7 +62,7 @@ void RNNBackwardDataModularAlgo::PrepareWriteBuffers(const Handle& handle,
     }
 }
 
-void RNNBackwardDataModularAlgo::PropDhy(const Handle& handle,
+void RNNBackwardDataModularAlgo::PropDhy(Handle& handle,
                                          ConstData_t dhy,
                                          Data_t workSpace,
                                          unsigned int layer,
@@ -295,7 +295,7 @@ void RNNBackwardDataModularAlgo::UpdateHStatePerTimeSeq(const Handle& handle,
     }
 }
 
-void RNNBackwardDataModularAlgo::PropDhxDcx(const Handle& handle,
+void RNNBackwardDataModularAlgo::PropDhxDcx(Handle& handle,
                                             ConstData_t w,
                                             Data_t dhx,
                                             Data_t dcx,
diff --git a/src/rnn/Solutions/Base/bw_weights_modular.cpp b/src/rnn/Solutions/Base/bw_weights_modular.cpp
index 598d002cb0..76a37e6630 100644
--- a/src/rnn/Solutions/Base/bw_weights_modular.cpp
+++ b/src/rnn/Solutions/Base/bw_weights_modular.cpp
@@ -32,7 +32,7 @@ namespace miopen {
 
 namespace rnn_base {
 
-miopenStatus_t ReducAddBias(const miopen::Handle& handle,
+miopenStatus_t ReducAddBias(miopen::Handle& handle,
                             Data_t dw,
                             const Data_t workSpace,
                             const miopen::TensorDescriptor& dw_desc,
@@ -243,7 +243,7 @@ void RNNBackwardWeightsModularAlgo::HiddenXInputWeights(const Handle& handle,
 }
 
 void RNNBackwardWeightsModularAlgo::BiasUpdate(
-    const Handle& handle, Data_t dw, Data_t workSpace, size_t layer, size_t workSpaceSize) const
+    Handle& handle, Data_t dw, Data_t workSpace, size_t layer, size_t workSpaceSize) const
 {
     if(rnnDesc.biasMode != 0u)
     {
diff --git a/src/rnn/Solutions/Base/fw_data_modular.cpp b/src/rnn/Solutions/Base/fw_data_modular.cpp
index ca6d18d294..450ab0be8b 100644
--- a/src/rnn/Solutions/Base/fw_data_modular.cpp
+++ b/src/rnn/Solutions/Base/fw_data_modular.cpp
@@ -59,13 +59,13 @@ void RNNForwardDataModularAlgo::PrepareWriteBuffers(const Handle& handle,
     }
 }
 
-void RNNForwardDataModularAlgo::PropX(const Handle& handle, const runtimeArgsFwd& runtimeArgs) const
+void RNNForwardDataModularAlgo::PropX(Handle& handle, const runtimeArgsFwd& runtimeArgs) const
 {
     const size_t gemm_batch_size = workspaceInfo.getGateBlockSize()[1];
 
     return PropX(handle, runtimeArgs, 0, gemm_batch_size);
 }
 
-void RNNForwardDataModularAlgo::PropX(const Handle& handle,
+void RNNForwardDataModularAlgo::PropX(Handle& handle,
                                       const runtimeArgsFwd& runtimeArgs,
                                       size_t gemm_batch_offset,
                                       size_t gemm_batch_size) const
@@ -188,8 +188,7 @@ void RNNForwardDataModularAlgo::PropHxCx(const Handle& handle,
     }
 }
 
-void RNNForwardDataModularAlgo::AddBias(const Handle& handle,
-                                        const runtimeArgsFwd& runtimeArgs) const
+void RNNForwardDataModularAlgo::AddBias(Handle& handle, const runtimeArgsFwd& runtimeArgs) const
 {
     if(rnnDesc.biasMode == miopenRNNNoBias)
         return;
diff --git a/src/rnn/Solutions/bwd_multi_stream.cpp b/src/rnn/Solutions/bwd_multi_stream.cpp
index 964c8d50fa..00f3dddc57 100644
--- a/src/rnn/Solutions/bwd_multi_stream.cpp
+++ b/src/rnn/Solutions/bwd_multi_stream.cpp
@@ -172,7 +172,7 @@ bool RNNModularMultiStreamBWD::ChunkDispatch(const runtimeArgsBwd& args,
                                              size_t chunk_layer_offset) const
 {
     constexpr auto seq_dir = rnn_base::SequenceDirection::Forward;
-    const Handle& handle = *args.handle;
+    Handle& handle = *args.handle;
 
     if(chunk_time_offset >= max_seq_len)
         return false;
diff --git a/src/rnn/Solutions/bww_multi_stream.cpp b/src/rnn/Solutions/bww_multi_stream.cpp
index 1f480afdea..f77ce128c3 100644
--- a/src/rnn/Solutions/bww_multi_stream.cpp
+++ b/src/rnn/Solutions/bww_multi_stream.cpp
@@ -38,7 +38,7 @@ void
RNNModularMultiStreamBWWeights::PrologueDispatch(const runtimeArgsBww& args rnnAlgoModules.PrepareWriteBuffers(*args.handle, args.dw); } -void RNNModularMultiStreamBWWeights::Compute(const Handle& handle, +void RNNModularMultiStreamBWWeights::Compute(Handle& handle, ConstData_t x, ConstData_t hx, Data_t dw, diff --git a/src/rnn/Solutions/bww_s_steam.cpp b/src/rnn/Solutions/bww_s_steam.cpp index 736d8cfde3..fa81015f12 100644 --- a/src/rnn/Solutions/bww_s_steam.cpp +++ b/src/rnn/Solutions/bww_s_steam.cpp @@ -29,7 +29,7 @@ namespace miopen { namespace rnn_base { -void RNNModularSingleStreamBWWeights::Compute(const Handle& handle, +void RNNModularSingleStreamBWWeights::Compute(Handle& handle, ConstData_t x, ConstData_t hx, Data_t dw, diff --git a/src/tensor.cpp b/src/tensor.cpp index c1bd709267..bc2efae3bf 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -26,12 +26,19 @@ #include #include +#include #include #include #include +#include +#include #include #include #include +#include +#include + +#include #include @@ -872,21 +879,1162 @@ void from_json(const nlohmann::json& j, TensorDescriptor& descriptor) j.at("type").get_to(descriptor.type); } -void OpTensor2(Handle& handle, - miopenTensorOp_t tensorOp, - const void* alpha0, - const TensorDescriptor& aTensorDesc, - ConstData_t ATensor, - const void* alpha1, - const TensorDescriptor& bTensorDesc, - ConstData_t BTensor, - const void* beta, - const TensorDescriptor& cTensorDesc, - Data_t CTensor, - const size_t Aoffset, - const size_t Boffset, - const size_t Coffset, - bool nonStandardSquash) +TensorDescriptor GetFlattenedTensorDescriptor(const TensorDescriptor& desc) +{ + // is packed + if(desc.IsPacked()) + return {desc.GetType(), {desc.GetElementSize()}, {static_cast(1)}}; + + // start flattening tensor + std::vector flat_lengths; + std::vector flat_strides; + + auto non1_length_strides = boost::combine(desc.GetLengths(), desc.GetStrides()) | + boost::adaptors::filtered(f_length_is_not_1_t()); + + auto i = non1_length_strides.begin(); + std::size_t flat_len = boost::get<0>(*i); + auto i_previous = i++; + + // the 0-th dimension full-length doesn't matter + for(; i != non1_length_strides.end(); ++i) + { + std::size_t len = boost::get<0>(*i); + std::size_t stride = boost::get<1>(*i); + std::size_t previous_stride = boost::get<1>(*i_previous); + std::size_t full_len = previous_stride / stride; + + if(len == full_len) + { + flat_len *= len; + } + else + { + flat_lengths.push_back(flat_len); + flat_strides.push_back(previous_stride); + flat_len = len; + } + i_previous = i; + } + flat_lengths.push_back(flat_len); + flat_strides.push_back(boost::get<1>(*i_previous)); + + return {desc.GetType(), flat_lengths, flat_strides}; +} + +struct two_exp_ceiling_t +{ + std::size_t operator()(std::size_t n) const + { + assert(n > 0); + + std::size_t i = 1; + + n--; + while(n != 0) + { + i *= 2; + n /= 2; + } + + return i; + } +}; + +static std::vector get_worker_sizes(const std::vector& data_sizes) +{ + const std::size_t dim = data_sizes.size(); + + std::vector worker_sizes(dim); + + std::transform(data_sizes.begin(), data_sizes.end(), worker_sizes.begin(), two_exp_ceiling_t{}); + + std::size_t wgd = std::accumulate( + worker_sizes.begin(), worker_sizes.end(), std::size_t{1}, std::multiplies()); + + if(wgd > 65536) + { + std::size_t n = wgd / 65536; + + int i = 0; + while(n > 1 && i < dim) + { + std::size_t size_old = worker_sizes[i]; + worker_sizes[i] = (size_old - 1) / n + 1; + n /= size_old / worker_sizes[i]; + ++i; + } + } + + return worker_sizes; +} + 
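For reference, the grid-sizing helper added above can be exercised in isolation. The sketch below is a minimal standalone distillation, not part of the patch: the functor is flattened into a free function, and the main() driver and sample shape are illustrative only. Each length is rounded up to the next power of two, and when the product of the rounded lengths exceeds 65536 the leading dimensions are ceil-divided down until the grid fits.

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <numeric>
    #include <vector>

    // Round n up to the next power of two (n must be > 0).
    static std::size_t two_exp_ceiling(std::size_t n)
    {
        assert(n > 0);
        std::size_t i = 1;
        n--;
        while(n != 0)
        {
            i *= 2;
            n /= 2;
        }
        return i;
    }

    // Mirror of the helper above: per-dimension power-of-two rounding, then a
    // shrink pass that keeps the total work-group product within 65536.
    static std::vector<std::size_t> get_worker_sizes(const std::vector<std::size_t>& data_sizes)
    {
        const std::size_t dim = data_sizes.size();
        std::vector<std::size_t> worker_sizes(dim);
        std::transform(data_sizes.begin(), data_sizes.end(), worker_sizes.begin(), two_exp_ceiling);

        std::size_t wgd = std::accumulate(
            worker_sizes.begin(), worker_sizes.end(), std::size_t{1}, std::multiplies<std::size_t>());

        if(wgd > 65536)
        {
            std::size_t n = wgd / 65536;
            std::size_t i = 0;
            while(n > 1 && i < dim)
            {
                std::size_t size_old = worker_sizes[i];
                worker_sizes[i]      = (size_old - 1) / n + 1; // ceil-divide dimension i by n
                n /= size_old / worker_sizes[i];               // account for the realized shrink
                ++i;
            }
        }
        return worker_sizes;
    }

    int main()
    {
        // A 300x300x3 tensor rounds to 512x512x4 (product 1048576), which the
        // shrink pass reduces to 32x512x4 (product exactly 65536).
        for(std::size_t len : get_worker_sizes({300, 300, 3}))
            std::cout << len << ' ';
        std::cout << '\n';
    }

The shrink loop divides by the realized ratio size_old / worker_sizes[i] rather than by n directly, so the running factor stays exact whenever the ceil-division lands on an even divisor, as it always does for power-of-two inputs.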
+void SetTensor(const Handle& handle, + const TensorDescriptor& yDesc, + Data_t y, + const void* alpha, + const int offset) +{ + if(y == nullptr || alpha == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm); + } + + const TensorDescriptor yDesc_flat = GetFlattenedTensorDescriptor(yDesc); + +#ifndef NDEBUG + if(yDesc.GetNumDims() != yDesc_flat.GetNumDims()) + { + MIOPEN_LOG_I2("real descriptor: " << yDesc); + MIOPEN_LOG_I2("flat descriptor: " << yDesc_flat); + } +#endif + + const std::size_t yDim_flat = yDesc_flat.GetNumDims(); + + assert(yDim_flat > 0 && yDim_flat <= 5); + + std::string kernel_name = "SubTensorOpWithScalar" + std::to_string(yDim_flat) + "d"; + + const miopenDataType_t dataType = yDesc_flat.GetType(); + + std::string network_config = "set " + std::to_string(dataType); + for(auto& len : yDesc_flat.GetLengths()) + { + network_config += " " + std::to_string(len); + } + + auto&& kernels = handle.GetKernels(kernel_name, network_config); + + KernelInvoke kernel; + + if(!kernels.empty()) + { + kernel = kernels.front(); + } + else + { + std::string program_name = "MIOpenSubTensorOpWithScalarKernel.cl"; + + std::vector worker_sizes = get_worker_sizes(yDesc_flat.GetLengths()); + + std::size_t wgd = std::accumulate(worker_sizes.begin(), + worker_sizes.end(), + std::size_t{1}, + std::multiplies()); + + std::size_t wld = 256 < wgd ? 256 : wgd; + std::stringstream ss; + ss << "-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET" + << GetDataTypeKernelParams(dataType); + for(int i = 0; i < yDim_flat; ++i) + { + ss << " -DWORK_LENGTH_" << std::to_string(i) << "=" << std::to_string(worker_sizes[i]); + } + + kernel = handle.AddKernel(kernel_name, + network_config, + program_name, + kernel_name, + {wld, 1, 1}, + {wgd, 1, 1}, + ss.str()); + } + + switch(yDim_flat) + { + case 1: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetLengths()[0])); + }); + + break; + } + case 2: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1])); + }); + + break; + } + case 3: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + static_cast(yDesc_flat.GetLengths()[2])); + }); + + break; + } + case 4: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetStrides()[3]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + static_cast(yDesc_flat.GetLengths()[2]), + static_cast(yDesc_flat.GetLengths()[3])); + }); + + break; + } + case 5: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetStrides()[3]), + static_cast(yDesc_flat.GetStrides()[4]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + 
static_cast(yDesc_flat.GetLengths()[2]), + static_cast(yDesc_flat.GetLengths()[3]), + static_cast(yDesc_flat.GetLengths()[4])); + }); + + break; + } + default: assert(false); + } +} + +void ScaleTensor(const Handle& handle, + const TensorDescriptor& yDesc, + Data_t y, + const void* alpha, + const int offset) +{ + if(y == nullptr || alpha == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm); + } + + const TensorDescriptor yDesc_flat = GetFlattenedTensorDescriptor(yDesc); + +#ifndef NDEBUG + if(yDesc.GetNumDims() != yDesc_flat.GetNumDims()) + { + MIOPEN_LOG_I2("real descriptor: " << yDesc); + MIOPEN_LOG_I2("flat descriptor: " << yDesc_flat); + } +#endif + + const std::size_t yDim_flat = yDesc_flat.GetNumDims(); + + assert(yDim_flat > 0 && yDim_flat <= 5); + + const miopenDataType_t dataType = yDesc_flat.GetType(); + + if(!(dataType == miopenHalf // + || dataType == miopenFloat // + || dataType == miopenInt32 // + || dataType == miopenDouble)) + { + MIOPEN_THROW(miopenStatusBadParm, "ScaleTensor: unsupported data type."); + } + + std::string kernel_name = "SubTensorOpWithScalar" + std::to_string(yDim_flat) + "d"; + + const std::vector& lens = yDesc_flat.GetLengths(); + + std::string network_config = "scale " + std::to_string(yDesc_flat.GetType()); + for(auto& len : lens) + { + network_config += " " + std::to_string(len); + } + + auto&& kernels = handle.GetKernels(kernel_name, network_config); + + KernelInvoke kernel; + + if(!kernels.empty()) + { + kernel = kernels.front(); + } + else + { + std::string program_name = "MIOpenSubTensorOpWithScalarKernel.cl"; + + std::vector worker_sizes = get_worker_sizes(lens); + + std::size_t wgd = std::accumulate(worker_sizes.begin(), + worker_sizes.end(), + std::size_t{1}, + std::multiplies()); + + std::size_t wld = 256 < wgd ? 
256 : wgd; + + std::string parms = "-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_MULTIPLY" + + GetDataTypeKernelParams(dataType); + for(int i = 0; i < yDim_flat; ++i) + { + parms += " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); + } + + kernel = handle.AddKernel(kernel_name, + network_config, + program_name, + kernel_name, + {wld, 1, 1}, + {wgd, 1, 1}, + parms); + } + + switch(yDim_flat) + { + case 1: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetLengths()[0])); + }); + + break; + } + case 2: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1])); + }); + + break; + } + case 3: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + static_cast(yDesc_flat.GetLengths()[2])); + }); + + break; + } + case 4: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetStrides()[3]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + static_cast(yDesc_flat.GetLengths()[2]), + static_cast(yDesc_flat.GetLengths()[3])); + }); + + break; + } + case 5: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetStrides()[3]), + static_cast(yDesc_flat.GetStrides()[4]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + static_cast(yDesc_flat.GetLengths()[2]), + static_cast(yDesc_flat.GetLengths()[3]), + static_cast(yDesc_flat.GetLengths()[4])); + }); + + break; + } + default: assert(false); + } +} + +void CopyTensor(const Handle& handle, + const TensorDescriptor& srcDesc, + ConstData_t src, + const TensorDescriptor& dstDesc, + Data_t dst, + int srcOffset, + int dstOffset, + bool forseAsync) +{ + if(src == nullptr || dst == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "Null pointer for tensor."); + } + + if(srcDesc.GetType() != dstDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, "Tensor types do not match."); + } + + if(srcDesc.GetLengths() != dstDesc.GetLengths()) + { + MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match."); + } + + auto flat_descriptors = GetConsistentFlattenedTensorDescriptors(srcDesc, dstDesc); + const TensorDescriptor& srcDesc_flat = std::get<0>(flat_descriptors); + const TensorDescriptor& dstDesc_flat = std::get<1>(flat_descriptors); + +#ifndef NDEBUG + if(srcDesc.GetNumDims() != srcDesc_flat.GetNumDims()) + { + MIOPEN_LOG_I2("src real descriptor: " << srcDesc); + MIOPEN_LOG_I2("src flat descriptor: " << srcDesc_flat); + MIOPEN_LOG_I2("dst real descriptor: " << dstDesc); + MIOPEN_LOG_I2("dst flat descriptor: " << dstDesc_flat); + } +#endif + + std::size_t srcDim_flat = srcDesc_flat.GetNumDims(); + + if(srcDim_flat 
< 1 || srcDim_flat > 5) + { + MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension sizes unsupported."); + } + + if(forseAsync || srcOffset > 0 || dstOffset > 0 || + (!(srcDesc_flat.IsPacked() && dstDesc_flat.IsPacked()))) + { + std::string kernel_name = "SubTensorOpWithSubTensor" + std::to_string(srcDim_flat) + "d"; + + const std::vector& lens = srcDesc_flat.GetLengths(); + + std::string network_config = "copy " + std::to_string(srcDesc_flat.GetType()); + for(auto& len : lens) + { + network_config += " " + std::to_string(len); + } + + auto&& kernels = handle.GetKernels(kernel_name, network_config); + + KernelInvoke kernel; + + if(!kernels.empty()) + { + kernel = kernels.front(); + } + else + { + std::string program_name = "MIOpenSubTensorOpWithSubTensorKernel.cl"; + + std::vector worker_sizes = get_worker_sizes(lens); + + std::size_t wgd = std::accumulate(worker_sizes.begin(), + worker_sizes.end(), + std::size_t{1}, + std::multiplies()); + + std::size_t wld = 256 < wgd ? 256 : wgd; + + std::string parms = "-DSUBTENSOR_OP_WITH_SUBTENSOR=SUBTENSOR_OP_WITH_SUBTENSOR_COPY" + + GetDataTypeKernelParams(srcDesc_flat.GetType()); + for(std::size_t i = 0; i < srcDim_flat; ++i) + { + parms += + " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); + } + + kernel = handle.AddKernel(kernel_name, + network_config, + program_name, + kernel_name, + {wld, 1, 1}, + {wgd, 1, 1}, + parms); + } + + switch(srcDim_flat) + { + case 1: { + kernel(src, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetLengths()[0]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0])); + + break; + } + case 2: { + kernel(src, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1])); + + break; + } + case 3: { + kernel(src, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetStrides()[2]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + static_cast(srcDesc_flat.GetLengths()[2]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1]), + static_cast(dstDesc_flat.GetStrides()[2])); + + break; + } + case 4: { + kernel(src, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetStrides()[2]), + static_cast(srcDesc_flat.GetStrides()[3]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + static_cast(srcDesc_flat.GetLengths()[2]), + static_cast(srcDesc_flat.GetLengths()[3]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1]), + static_cast(dstDesc_flat.GetStrides()[2]), + static_cast(dstDesc_flat.GetStrides()[3])); + + break; + } + case 5: { + kernel(src, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetStrides()[2]), + static_cast(srcDesc_flat.GetStrides()[3]), + static_cast(srcDesc_flat.GetStrides()[4]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + static_cast(srcDesc_flat.GetLengths()[2]), + static_cast(srcDesc_flat.GetLengths()[3]), + 
static_cast(srcDesc_flat.GetLengths()[4]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1]), + static_cast(dstDesc_flat.GetStrides()[2]), + static_cast(dstDesc_flat.GetStrides()[3]), + static_cast(dstDesc_flat.GetStrides()[4])); + + break; + } + default: assert(false); + } + } + else + { + handle.Copy(src, dst, srcDesc_flat.GetElementSize() * GetTypeSize(srcDesc_flat.GetType())); + } +} + +std::string GetCastTensorBuildOptionFromType(const std::string& buildOption, miopenDataType_t type) +{ + std::string option(buildOption); + switch(type) + { + case miopenInt8: return option += "0"; + case miopenInt32: return option += "1"; + case miopenHalf: return option += "2"; + case miopenFloat: return option += "3"; + case miopenBFloat16: return option += "4"; + case miopenFloat8: + MIOPEN_THROW(miopenStatusBadParm, "miopenFloat8 data type not supported in cast tensor."); + case miopenBFloat8: + MIOPEN_THROW(miopenStatusBadParm, "miopenBFloat8 data type not supported in cast tensor."); + case miopenDouble: + // TODO + MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported in cast tensor."); + case miopenInt64: + MIOPEN_THROW(miopenStatusBadParm, "miopenInt64 data type not supported in cast tensor."); + default: MIOPEN_THROW(miopenStatusBadParm, "Invalid data type in cast tensor desc."); + } +} + +void CastTensor(const Handle& handle, + const void* alpha, + const bool clamping, + const TensorDescriptor& srcDesc, + ConstData_t src, + const TensorDescriptor& dstDesc, + Data_t dst, + int srcOffset, + int dstOffset) +{ + if(src == nullptr || dst == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "Null pointer for tensor."); + } + + if(srcDesc.GetLengths() != dstDesc.GetLengths()) + { + MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match."); + } + + auto flat_descriptors = GetConsistentFlattenedTensorDescriptors(srcDesc, dstDesc); + const TensorDescriptor& srcDesc_flat = std::get<0>(flat_descriptors); + const TensorDescriptor& dstDesc_flat = std::get<1>(flat_descriptors); + +#ifndef NDEBUG + if(srcDesc.GetNumDims() != srcDesc_flat.GetNumDims()) + { + MIOPEN_LOG_I2("src real descriptor: " << srcDesc); + MIOPEN_LOG_I2("src flat descriptor: " << srcDesc_flat); + MIOPEN_LOG_I2("dst real descriptor: " << dstDesc); + MIOPEN_LOG_I2("dst flat descriptor: " << dstDesc_flat); + } +#endif + + std::size_t srcDim_flat = srcDesc_flat.GetNumDims(); + + if(srcDim_flat < 1 || srcDim_flat > 5) + { + MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension sizes unsupported."); + } + + if(srcDesc.GetType() == dstDesc.GetType() && srcOffset == 0 && dstOffset == 0 && + srcDesc_flat.IsPacked() && dstDesc_flat.IsPacked()) + { + handle.Copy(src, dst, srcDesc_flat.GetElementSize() * GetTypeSize(srcDesc_flat.GetType())); + } + else + { + std::string kernel_name = "SubTensorOpWithCastTensor" + std::to_string(srcDim_flat) + "d"; + + const std::vector& lens = srcDesc_flat.GetLengths(); + + std::string network_config = "cast " + std::to_string(dstDesc_flat.GetType()); + for(auto& len : lens) + { + network_config += " " + std::to_string(len); + } + + auto&& kernels = handle.GetKernels(kernel_name, network_config); + KernelInvoke kernel; + + auto miopen_alpha = *(static_cast(alpha)); + + if(!kernels.empty()) + { + kernel = kernels.front(); + } + else + { + std::string program_name = "MIOpenSubTensorOpWithCastTensorKernel.cl"; + + std::vector worker_sizes = get_worker_sizes(lens); + + std::size_t wgd = std::accumulate(worker_sizes.begin(), + 
worker_sizes.end(), + std::size_t{1}, + std::multiplies()); + + std::size_t wld = 256 < wgd ? 256 : wgd; + + std::string parms = + GetCastTensorBuildOptionFromType(" -DMIOPEN_SRC_TYPE=", srcDesc_flat.GetType()) + + GetCastTensorBuildOptionFromType(" -DMIOPEN_DST_TYPE=", dstDesc_flat.GetType()); + + for(std::size_t i = 0; i < srcDim_flat; ++i) + { + parms += + " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); + } + + if(dstDesc_flat.GetType() == miopenBFloat16) + { + parms += " -DMIOPEN_USE_RNE_BFLOAT16=1"; + } + + kernel = handle.AddKernel(kernel_name, + network_config, + program_name, + kernel_name, + {wld, 1, 1}, + {wgd, 1, 1}, + parms); + } + + const int clamping_arg = clamping ? 1 : 0; + switch(srcDim_flat) + { + case 1: { + kernel(src, + miopen_alpha, + clamping_arg, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetLengths()[0]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0])); + + break; + } + case 2: { + kernel(src, + miopen_alpha, + clamping_arg, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1])); + + break; + } + case 3: { + kernel(src, + miopen_alpha, + clamping_arg, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetStrides()[2]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + static_cast(srcDesc_flat.GetLengths()[2]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1]), + static_cast(dstDesc_flat.GetStrides()[2])); + + break; + } + case 4: { + kernel(src, + miopen_alpha, + clamping_arg, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetStrides()[2]), + static_cast(srcDesc_flat.GetStrides()[3]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + static_cast(srcDesc_flat.GetLengths()[2]), + static_cast(srcDesc_flat.GetLengths()[3]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1]), + static_cast(dstDesc_flat.GetStrides()[2]), + static_cast(dstDesc_flat.GetStrides()[3])); + + break; + } + case 5: { + kernel(src, + miopen_alpha, + clamping_arg, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetStrides()[2]), + static_cast(srcDesc_flat.GetStrides()[3]), + static_cast(srcDesc_flat.GetStrides()[4]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + static_cast(srcDesc_flat.GetLengths()[2]), + static_cast(srcDesc_flat.GetLengths()[3]), + static_cast(srcDesc_flat.GetLengths()[4]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1]), + static_cast(dstDesc_flat.GetStrides()[2]), + static_cast(dstDesc_flat.GetStrides()[3]), + static_cast(dstDesc_flat.GetStrides()[4])); + + break; + } + default: assert(false); + } + } +} + +void TransformTensor(const Handle& handle, + const void* alpha, + const TensorDescriptor& xDesc, + ConstData_t x, + const void* beta, + const TensorDescriptor& yDesc, + Data_t y, + size_t Xoffset, + 
+void TransformTensor(const Handle& handle,
+                     const void* alpha,
+                     const TensorDescriptor& xDesc,
+                     ConstData_t x,
+                     const void* beta,
+                     const TensorDescriptor& yDesc,
+                     Data_t y,
+                     size_t Xoffset,
+                     size_t Yoffset)
+{
+    if(x == nullptr || y == nullptr)
+    {
+        MIOPEN_THROW(miopenStatusBadParm);
+    }
+
+    if(alpha == nullptr || beta == nullptr)
+    {
+        MIOPEN_THROW(miopenStatusBadParm);
+    }
+
+    auto x_len = xDesc.GetLengths();
+    auto y_len = yDesc.GetLengths();
+
+    if(x_len.size() != y_len.size())
+    {
+        MIOPEN_THROW("Tensor dimensions must be the same");
+    }
+
+    if(x_len[0] != y_len[0])
+    {
+        MIOPEN_THROW("Tensor x and y batch sizes do not match");
+    }
+
+    const auto is_alpha_one = float_equal(*(static_cast<const float*>(alpha)), 1);
+    const auto is_beta_zero = float_equal(*(static_cast<const float*>(beta)), 0);
+
+    if(xDesc.GetType() == miopenInt8 && yDesc.GetType() == miopenInt8 && x_len.size() >= 3)
+    {
+        if(x_len[1] <= y_len[1])
+        {
+            if(x_len[1] <= (y_len[1] - 4) || y_len[1] % 4 != 0)
+            {
+                MIOPEN_THROW("Invalid y channel size");
+            }
+
+            int8_t zero = 0;
+            SetTensor(handle, yDesc, y, &zero);
+        }
+        else if(x_len[1] % 4 != 0)
+        {
+            MIOPEN_THROW("Invalid x channel size");
+        }
+
+        size_t batch_n = x_len[0];
+
+        x_len[0] = 1;
+        y_len[0] = 1;
+
+        miopen::TensorDescriptor x_batch_desc, y_batch_desc;
+        x_batch_desc = miopen::TensorDescriptor(miopenInt8, x_len);
+        y_batch_desc = miopen::TensorDescriptor(miopenInt8, y_len);
+
+        size_t x_batch_sz = x_batch_desc.GetElementSize();
+        size_t y_batch_sz = y_batch_desc.GetElementSize();
+
+        for(size_t i = 0; i < batch_n; i++)
+        {
+            size_t x_offset = i * x_batch_sz;
+            size_t y_offset = i * y_batch_sz;
+
+            if(is_alpha_one && is_beta_zero)
+            {
+                CopyTensor(handle,
+                           ((x_len[1] <= y_len[1]) ? x_batch_desc : y_batch_desc),
+                           x,
+                           ((x_len[1] <= y_len[1]) ? x_batch_desc : y_batch_desc),
+                           y,
+                           x_offset,
+                           y_offset);
+            }
+            else
+            {
+                MIOPEN_THROW(miopenStatusNotImplemented,
+                             "y=alpha*x+beta*y is not supported for int8 yet");
+            }
+        }
+    }
+    else
+    {
+        auto x_y_len          = boost::combine(x_len, y_len);
+        bool same_spatial_len = std::all_of(x_y_len.begin(), x_y_len.end(), [](auto v) {
+            return boost::get<0>(v) == boost::get<1>(v);
+        });
+
+        if(!same_spatial_len)
+        {
+            MIOPEN_THROW("Tensor x and y spatial sizes do not match");
+        }
+
+        auto flat_descriptors = GetConsistentFlattenedTensorDescriptors(xDesc, yDesc);
+        const TensorDescriptor& xDesc_flat = std::get<0>(flat_descriptors);
+        const TensorDescriptor& yDesc_flat = std::get<1>(flat_descriptors);
+
+        if(xDesc.GetNumDims() != xDesc_flat.GetNumDims())
+        {
+            MIOPEN_LOG_I2("x real descriptor: " << xDesc);
+            MIOPEN_LOG_I2("x flat descriptor: " << xDesc_flat);
+        }
+
+        if(yDesc.GetNumDims() != yDesc_flat.GetNumDims())
+        {
+            MIOPEN_LOG_I2("y real descriptor: " << yDesc);
+            MIOPEN_LOG_I2("y flat descriptor: " << yDesc_flat);
+        }
+
+        const std::size_t yDim_flat = yDesc_flat.GetNumDims();
+
+        assert(yDim_flat > 0 && yDim_flat <= 5);
+
+        const miopenDataType_t dataTypex = xDesc_flat.GetType();
+        const miopenDataType_t dataTypey = yDesc_flat.GetType();
+
+        if(!(dataTypex == miopenHalf     //
+             || dataTypex == miopenFloat //
+             || dataTypex == miopenInt32 //
+             || dataTypex == miopenBFloat16 //
+             || dataTypex == miopenDouble))
+        {
+            MIOPEN_THROW("Tensor x is an unsupported data type");
+        }
+
+        if(!(dataTypey == miopenHalf     //
+             || dataTypey == miopenFloat //
+             || dataTypey == miopenInt32 //
+             || dataTypey == miopenBFloat16 //
+             || dataTypey == miopenDouble))
+        {
+            MIOPEN_THROW("Tensor y is an unsupported data type");
+        }
+
+        if(dataTypex != dataTypey)
+        {
+            MIOPEN_THROW("Tensor x and y have different data types");
+        }
+
+        std::string kernel_name = "SubTensorOpWithTransform" + std::to_string(yDim_flat) + "d";
+
+        const std::vector<std::size_t>& lens = yDesc_flat.GetLengths();
+
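+        // Kernels are cached per (kernel_name, network_config), so the config
+        // string encodes the data type, the flat lengths, and the alpha/beta
+        // fast-path flags that specialize the compiled kernel.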
std::string network_config = "transform " + std::to_string(yDesc_flat.GetType()); + for(auto& len : lens) + { + network_config += "x" + std::to_string(len); + } + + if(is_beta_zero) + network_config += "xBETA_IS_ZERO"; + if(is_alpha_one) + network_config += "xALPHA_IS_ONE"; + + auto&& kernels = handle.GetKernels(kernel_name, network_config); + + KernelInvoke kernel; + + if(!kernels.empty()) + { + kernel = kernels.front(); + } + else + { + std::string program_name = "MIOpenSubTensorOpWithTransformKernel.cl"; + + std::vector worker_sizes = get_worker_sizes(lens); + + std::size_t wgd = std::accumulate(worker_sizes.begin(), + worker_sizes.end(), + std::size_t{1}, + std::multiplies()); + + std::size_t wld = 256 < wgd ? 256 : wgd; + + std::string parms = + GetDataTypeKernelParams(dataTypey) // + + " -DMIOPEN_BETA_IS_ZERO=" + std::to_string(static_cast(is_beta_zero)) // + + " -DMIOPEN_ALPHA_IS_ONE=" + std::to_string(static_cast(is_alpha_one)); + + for(int i = 0; i < yDim_flat; ++i) + { + parms += + " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); + } + + kernel = handle.AddKernel(kernel_name, + network_config, + program_name, + kernel_name, + {wld, 1, 1}, + {wgd, 1, 1}, + parms); + } + + switch(yDim_flat) + { + case 1: { + visit_float(dataTypey, [&](auto as_float) { + kernel(x, + *as_float(alpha), + y, + *as_float(beta), + static_cast(Xoffset), + static_cast(Yoffset), + static_cast(xDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetLengths()[0])); + }); + + break; + } + case 2: { + visit_float(dataTypey, [&](auto as_float) { + kernel(x, + *as_float(alpha), + y, + *as_float(beta), + static_cast(Xoffset), + static_cast(Yoffset), + static_cast(xDesc_flat.GetStrides()[0]), + static_cast(xDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1])); + }); + + break; + } + case 3: { + visit_float(dataTypey, [&](auto as_float) { + kernel(x, + *as_float(alpha), + y, + *as_float(beta), + static_cast(Xoffset), + static_cast(Yoffset), + static_cast(xDesc_flat.GetStrides()[0]), + static_cast(xDesc_flat.GetStrides()[1]), + static_cast(xDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + static_cast(yDesc_flat.GetLengths()[2])); + }); + + break; + } + case 4: { + visit_float(dataTypey, [&](auto as_float) { + kernel(x, + *as_float(alpha), + y, + *as_float(beta), + static_cast(Xoffset), + static_cast(Yoffset), + static_cast(xDesc_flat.GetStrides()[0]), + static_cast(xDesc_flat.GetStrides()[1]), + static_cast(xDesc_flat.GetStrides()[2]), + static_cast(xDesc_flat.GetStrides()[3]), + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetStrides()[3]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + static_cast(yDesc_flat.GetLengths()[2]), + static_cast(yDesc_flat.GetLengths()[3])); + }); + + break; + } + case 5: { + visit_float(dataTypey, [&](auto as_float) { + kernel(x, + *as_float(alpha), + y, + *as_float(beta), + static_cast(Xoffset), + static_cast(Yoffset), + static_cast(xDesc_flat.GetStrides()[0]), + static_cast(xDesc_flat.GetStrides()[1]), + 
+        switch(yDim_flat)
+        {
+        case 1: {
+            visit_float(dataTypey, [&](auto as_float) {
+                kernel(x,
+                       *as_float(alpha),
+                       y,
+                       *as_float(beta),
+                       static_cast<int>(Xoffset),
+                       static_cast<int>(Yoffset),
+                       static_cast<int>(xDesc_flat.GetStrides()[0]),
+                       static_cast<int>(yDesc_flat.GetStrides()[0]),
+                       static_cast<int>(yDesc_flat.GetLengths()[0]));
+            });
+
+            break;
+        }
+        case 2: {
+            visit_float(dataTypey, [&](auto as_float) {
+                kernel(x,
+                       *as_float(alpha),
+                       y,
+                       *as_float(beta),
+                       static_cast<int>(Xoffset),
+                       static_cast<int>(Yoffset),
+                       static_cast<int>(xDesc_flat.GetStrides()[0]),
+                       static_cast<int>(xDesc_flat.GetStrides()[1]),
+                       static_cast<int>(yDesc_flat.GetStrides()[0]),
+                       static_cast<int>(yDesc_flat.GetStrides()[1]),
+                       static_cast<int>(yDesc_flat.GetLengths()[0]),
+                       static_cast<int>(yDesc_flat.GetLengths()[1]));
+            });
+
+            break;
+        }
+        case 3: {
+            visit_float(dataTypey, [&](auto as_float) {
+                kernel(x,
+                       *as_float(alpha),
+                       y,
+                       *as_float(beta),
+                       static_cast<int>(Xoffset),
+                       static_cast<int>(Yoffset),
+                       static_cast<int>(xDesc_flat.GetStrides()[0]),
+                       static_cast<int>(xDesc_flat.GetStrides()[1]),
+                       static_cast<int>(xDesc_flat.GetStrides()[2]),
+                       static_cast<int>(yDesc_flat.GetStrides()[0]),
+                       static_cast<int>(yDesc_flat.GetStrides()[1]),
+                       static_cast<int>(yDesc_flat.GetStrides()[2]),
+                       static_cast<int>(yDesc_flat.GetLengths()[0]),
+                       static_cast<int>(yDesc_flat.GetLengths()[1]),
+                       static_cast<int>(yDesc_flat.GetLengths()[2]));
+            });
+
+            break;
+        }
+        case 4: {
+            visit_float(dataTypey, [&](auto as_float) {
+                kernel(x,
+                       *as_float(alpha),
+                       y,
+                       *as_float(beta),
+                       static_cast<int>(Xoffset),
+                       static_cast<int>(Yoffset),
+                       static_cast<int>(xDesc_flat.GetStrides()[0]),
+                       static_cast<int>(xDesc_flat.GetStrides()[1]),
+                       static_cast<int>(xDesc_flat.GetStrides()[2]),
+                       static_cast<int>(xDesc_flat.GetStrides()[3]),
+                       static_cast<int>(yDesc_flat.GetStrides()[0]),
+                       static_cast<int>(yDesc_flat.GetStrides()[1]),
+                       static_cast<int>(yDesc_flat.GetStrides()[2]),
+                       static_cast<int>(yDesc_flat.GetStrides()[3]),
+                       static_cast<int>(yDesc_flat.GetLengths()[0]),
+                       static_cast<int>(yDesc_flat.GetLengths()[1]),
+                       static_cast<int>(yDesc_flat.GetLengths()[2]),
+                       static_cast<int>(yDesc_flat.GetLengths()[3]));
+            });
+
+            break;
+        }
+        case 5: {
+            visit_float(dataTypey, [&](auto as_float) {
+                kernel(x,
+                       *as_float(alpha),
+                       y,
+                       *as_float(beta),
+                       static_cast<int>(Xoffset),
+                       static_cast<int>(Yoffset),
+                       static_cast<int>(xDesc_flat.GetStrides()[0]),
+                       static_cast<int>(xDesc_flat.GetStrides()[1]),
+                       static_cast<int>(xDesc_flat.GetStrides()[2]),
+                       static_cast<int>(xDesc_flat.GetStrides()[3]),
+                       static_cast<int>(xDesc_flat.GetStrides()[4]),
+                       static_cast<int>(yDesc_flat.GetStrides()[0]),
+                       static_cast<int>(yDesc_flat.GetStrides()[1]),
+                       static_cast<int>(yDesc_flat.GetStrides()[2]),
+                       static_cast<int>(yDesc_flat.GetStrides()[3]),
+                       static_cast<int>(yDesc_flat.GetStrides()[4]),
+                       static_cast<int>(yDesc_flat.GetLengths()[0]),
+                       static_cast<int>(yDesc_flat.GetLengths()[1]),
+                       static_cast<int>(yDesc_flat.GetLengths()[2]),
+                       static_cast<int>(yDesc_flat.GetLengths()[3]),
+                       static_cast<int>(yDesc_flat.GetLengths()[4]));
+            });
+
+            break;
+        }
+        default: assert(false);
+        }
+    }
+}
+
+void OpTensor(Handle& handle,
+              miopenTensorOp_t tensorOp,
+              const void* alpha0,
+              const TensorDescriptor& aTensorDesc,
+              ConstData_t ATensor,
+              const void* alpha1,
+              const TensorDescriptor& bTensorDesc,
+              ConstData_t BTensor,
+              const void* beta,
+              const TensorDescriptor& cTensorDesc,
+              Data_t CTensor,
+              const size_t Aoffset,
+              const size_t Boffset,
+              const size_t Coffset,
+              bool nonStandardSquash)
 {
     if(ATensor == nullptr || BTensor == nullptr || CTensor == nullptr)
     {
diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp
index 1df83044b2..3121715e8a 100644
--- a/test/tensor_ops.cpp
+++ b/test/tensor_ops.cpp
@@ -181,24 +181,24 @@ struct verify_tensor_ops
         auto a_dev = handle.Write(a.data);
         auto b_dev = handle.Write(b.data);

-        miopen::OpTensor2(handle,
-                          // miopenTensorOpAdd,
-                          // miopenTensorOpMax,
-                          // miopenTensorOpMin,
-                          miopenTensorOpMul,
-                          &alpha0,
-                          a.desc,
-                          a_dev.get(),
-                          &alpha1,
-                          b.desc,
-                          b_dev.get(),
-                          &beta,
-                          c.desc,
-                          c_dev.get(),
-                          Aoffset,
-                          Boffset,
-                          Coffset,
-                          false); // it does not verify non-standard behaviour
+        miopen::OpTensor(handle,
+                         // miopenTensorOpAdd,
+                         // miopenTensorOpMax,
+                         // miopenTensorOpMin,
+                         miopenTensorOpMul,
+                         &alpha0,
+                         a.desc,
+                         a_dev.get(),
+                         &alpha1,
+                         b.desc,
+                         b_dev.get(),
+                         &beta,
+                         c.desc,
+                         c_dev.get(),
+                         Aoffset,
+                         Boffset,
+                         Coffset,
+                         false); // it does not verify non-standard behaviour

         if(not no_validate)
         {

From 155b35fae50ba2f816203db49ae6cb3d3866483c Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Wed, 20 Nov 2024 21:04:09 +0200
Subject: [PATCH 22/25] code tidying

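std::clamp already bounds num_wg to [1, max_num_wg], so the explicit
`num_wg = num_wg > max_num_wg ? max_num_wg : num_wg;` re-check afterwards
was dead code; drop it in the 1d and 2d solvers. Op3dTensorGeneric now
derives num_wg with the same std::clamp pattern (and a 32-thread work-group)
instead of going through GetBitmapAndWgInfo.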
---
 src/solver/tensorOp/Op1dTensorGeneric.cpp | 1 -
 src/solver/tensorOp/Op2dTensorGeneric.cpp | 1 -
 src/solver/tensorOp/Op3dTensorGeneric.cpp | 9 ++++-----
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp
index 896d75d50c..341e074d89 100644
--- a/src/solver/tensorOp/Op1dTensorGeneric.cpp
+++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp
@@ -89,7 +89,6 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context,
     size_t max_num_wg = 4096;

     auto num_wg = std::clamp(c_n / local_threads, size_t(1), size_t(max_num_wg));
-    num_wg      = num_wg > max_num_wg ? max_num_wg : num_wg;

     size_t global_threads = num_wg * local_threads;
     const std::array<size_t, 3> vld{local_threads, 1, 1};
diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp
index 41fca78068..35c9629ba7 100644
--- a/src/solver/tensorOp/Op2dTensorGeneric.cpp
+++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp
@@ -93,7 +93,6 @@ Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context,
     size_t max_num_wg = 4096;

     auto num_wg =
         std::clamp((clens[0] * clens[1]) / local_threads, size_t(1), size_t(max_num_wg));
-    num_wg = num_wg > max_num_wg ? max_num_wg : num_wg;

     size_t global_threads = num_wg * local_threads;
     const std::array<size_t, 3> vld{local_threads, 1, 1};
diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp
index 2bafc6abaa..782eb1804f 100644
--- a/src/solver/tensorOp/Op3dTensorGeneric.cpp
+++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp
@@ -86,12 +86,11 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context,

     miopenDataType_t data_type = bTensorDesc.GetType();

-    auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens);
+    size_t local_threads = 32;
+    size_t max_num_wg    = 4096;

-    int max_num_wg = 4096;
-    num_wg         = num_wg > max_num_wg ? max_num_wg : num_wg;
-
-    size_t local_threads = 256;
+    auto num_wg =
+        std::clamp((clens[0] * clens[1] * clens[2]) / local_threads, size_t(1), size_t(max_num_wg));

     size_t global_threads = num_wg * local_threads;
     const std::array<size_t, 3> vld{local_threads, 1, 1};

From 0b3454c678ee60ad7605d324c3979af8310193b2 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Mon, 25 Nov 2024 17:00:24 +0200
Subject: [PATCH 23/25] unit test for tensorOp PD + additional changes requested

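Adds a parameterized gtest that constructs tensorOp::ProblemDescription
directly: the well-formed case must succeed and report the expected beta,
while broadcast-length, rank, data-type, and nonStandardSquash violations
must make the constructor throw. Also removes the unused data_type/READ_TYPE
locals from Get4dParams and restores alphabetical order in the solver source
list.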
---
 src/CMakeLists.txt                            |   6 +-
 src/solver/tensorOp/tensor_op_helpers.hpp     |   5 -
 .../unit_tensorOp_ProblemDescription.cpp      | 200 ++++++++++++++++++
 3 files changed, 203 insertions(+), 8 deletions(-)
 create mode 100644 test/gtest/unit_tensorOp_ProblemDescription.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 0721efc4f3..c9f1ab511a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -342,11 +342,11 @@ set( MIOpen_Source
     solver/tensorOp/Op2dTensorLite.cpp
     solver/tensorOp/Op2dTensorSquash.cpp
     solver/tensorOp/Op3dTensorGeneric.cpp
-    solver/tensorOp/OpTensorFwdBias.cpp
-    solver/tensorOp/Op4dTensorLite.cpp
-    solver/tensorOp/OpTensorLeadingOnes.cpp
     solver/tensorOp/Op4dTensorGeneric.cpp
+    solver/tensorOp/Op4dTensorLite.cpp
     solver/tensorOp/Op5dTensorGeneric.cpp
+    solver/tensorOp/OpTensorFwdBias.cpp
+    solver/tensorOp/OpTensorLeadingOnes.cpp
     subbuffers.cpp
     t5layernorm_api.cpp
     target_properties.cpp
diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp
index 26a9ac42d0..cf46c6efe8 100644
--- a/src/solver/tensorOp/tensor_op_helpers.hpp
+++ b/src/solver/tensorOp/tensor_op_helpers.hpp
@@ -193,13 +193,8 @@ Get4dParams(const miopen::tensorOp::ProblemDescription& problem, bool is4dLite)

     if(is4dLite)
     {
-        // for naive tensor ops
-        const std::string data_type = GetDataType(bTensorDesc.GetType());
-
         size_t TENS_LEN = cTensorDesc.GetElementSize();
         size_t RD_BLCK  = (TENS_LEN % 4 == 0) ? 4 : (TENS_LEN % 2 == 0) ? 2 : 1;
-        const std::string READ_TYPE =
-            (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK);

         size_t total_work = std::max(TENS_LEN / RD_BLCK, size_t(1));
         size_t grp_sz     = (total_work + local_threads - 1) / local_threads;
diff --git a/test/gtest/unit_tensorOp_ProblemDescription.cpp b/test/gtest/unit_tensorOp_ProblemDescription.cpp
new file mode 100644
index 0000000000..1b02382881
--- /dev/null
+++ b/test/gtest/unit_tensorOp_ProblemDescription.cpp
@@ -0,0 +1,200 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/miopen.h>
+#include <miopen/tensorOp/problem_description.hpp>
+
+#include "unit_TensorDescriptor.hpp"
+#include <gtest/gtest.h>
+
+namespace {
+
+struct TensorOpProblemDescriptionTestCase
+{
+    miopenTensorOp_t tensorOp;
+    float beta;
+    miopen::unit_tests::TensorDescriptorParams aTensorDesc;
+    miopen::unit_tests::TensorDescriptorParams bTensorDesc;
+    miopen::unit_tests::TensorDescriptorParams cTensorDesc;
+    bool nonStandardSquash;
+    bool isOk;
+
+    friend std::ostream& operator<<(std::ostream& os, const TensorOpProblemDescriptionTestCase& tc)
+    {
+        std::string op;
+        switch(tc.tensorOp)
+        {
+        case miopenTensorOpAdd: op.append("miopenTensorOpAdd"); break;
+        case miopenTensorOpMul: op.append("miopenTensorOpMul"); break;
+        case miopenTensorOpMin: op.append("miopenTensorOpMin"); break;
+        case miopenTensorOpMax: op.append("miopenTensorOpMax"); break;
+
+        default: break;
+        }
+
+        os << "(" << tc.aTensorDesc << "), ";
+        os << "(" << tc.bTensorDesc << "), ";
+        os << "(" << tc.cTensorDesc << "), \n";
+        os << "(" << op << ") - beta ";
+        os << std::to_string(tc.beta) << ")\n";
+        return os;
+    }
+};
+
+class TestTensorOpPD : public ::testing::TestWithParam<TensorOpProblemDescriptionTestCase>
+{
+public:
+    static auto GetTestCases()
+    {
+        using TestCase = TensorOpProblemDescriptionTestCase;
+
+        return std::vector<TestCase>{
+            // clang-format off
+            // 4D
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                0.0f,                       // beta
+                {miopenHalf, {1, 4, 4, 4}}, // A
+                {miopenHalf, {1, 4, 4, 4}}, // B
+                {miopenHalf, {1, 4, 4, 4}}, // C
+                false,                      // nonStandardSquash
+                true                        // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                0.0f,                       // beta
+                {miopenHalf, {4, 4, 4}},    // A
+                {miopenHalf, {1, 1, 4}},    // B
+                {miopenHalf, {4, 4, 4}},    // C
+                false,                      // nonStandardSquash
+                false                       // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                1.0f,                       // beta
+                {miopenHalf, {4, 1, 4}},    // A
+                {miopenHalf, {1, 1, 4}},    // B
+                {miopenHalf, {4, 4, 4}},    // C
+                false,                      // nonStandardSquash
+                false                       // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                1.0f,                       // beta
+                {miopenHalf, {4, 4, 4}},    // A
+                {miopenHalf, {1, 1, 4}},    // B
+                {miopenFloat, {4, 4, 4}},   // C
+                false,                      // nonStandardSquash
+                false                       // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,               // tensorOp
+                1.0f,                            // beta
+                {miopenHalf, {4, 4, 4, 4, 4, 4}},// A
+                {miopenHalf, {1, 1, 4}},         // B
+                {miopenHalf, {4, 4, 4, 4, 4, 4}},// C
+                false,                           // nonStandardSquash
+                false                            // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                1.0f,                       // beta
+                {miopenHalf, {4, 4, 4}},    // A
+                {miopenHalf, {1, 4}},       // B
+                {miopenHalf, {4, 4, 4}},    // C
+                false,                      // nonStandardSquash
+                false                       // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                1.0f,                       // beta
+                {miopenHalf, {4, 4, 4}},    // A
+                {miopenHalf, {1, 1, 5}},    // B
+                {miopenHalf, {4, 4, 4}},    // C
+                false,                      // nonStandardSquash
+                false                       // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                1.0f,                       // beta
+                {miopenHalf, {4, 4, 4, 4}}, // A
+                {miopenHalf, {1, 1, 4, 4}}, // B
+                {miopenHalf, {4, 4, 4, 4}}, // C
+                true,                       // nonStandardSquash
+                false                       // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                1.0f,                       // beta
+                {miopenHalf, {1, 4, 2}},    // A
+                {miopenHalf, {1, 1, 4}},    // B
+                {miopenHalf, {1, 4, 2}},    // C
+                true,                       // nonStandardSquash
+                false                       // isOk
+            }
+            // clang-format on
+        };
+    }
+
+    void RunTest()
+    {
+        const auto p = GetParam();
+
+        if(p.isOk)
+        {
+            const auto pd =
+                miopen::tensorOp::ProblemDescription{p.tensorOp,
+                                                     static_cast<const void*>(&p.beta),
+                                                     p.aTensorDesc.GetTensorDescriptor(),
+                                                     p.bTensorDesc.GetTensorDescriptor(),
+                                                     p.cTensorDesc.GetTensorDescriptor(),
+                                                     p.nonStandardSquash};
+            ASSERT_EQ(pd.GetBeta(), p.beta);
+        }
+        else
+        {
+            ASSERT_ANY_THROW({
+                const auto pd = miopen::tensorOp::ProblemDescription(
+                    p.tensorOp,
+                    miopen::float_equal(p.beta, 0.0) ? nullptr
+                                                     : static_cast<const void*>(&p.beta),
+                    p.aTensorDesc.GetTensorDescriptor(),
+                    p.bTensorDesc.GetTensorDescriptor(),
+                    p.cTensorDesc.GetTensorDescriptor(),
+                    p.nonStandardSquash);
+            });
+        }
+    }
+};
+
+} // namespace
+
+using CPU_TensorOpProblemDescription_NONE = TestTensorOpPD;
+
+TEST_P(CPU_TensorOpProblemDescription_NONE, TensorOpProblemDescription) { this->RunTest(); };
+
+INSTANTIATE_TEST_SUITE_P(Full,
+                         CPU_TensorOpProblemDescription_NONE,
+                         testing::ValuesIn(TestTensorOpPD::GetTestCases()));

From 146070a40f2cd826e070a037a0f4139c44d79044 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Fri, 29 Nov 2024 10:37:27 +0200
Subject: [PATCH 24/25] fix windows build issue

---
 src/include/miopen/tensorOp/problem_description.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/include/miopen/tensorOp/problem_description.hpp b/src/include/miopen/tensorOp/problem_description.hpp
index dc60a3c7c9..ecbf189b3f 100644
--- a/src/include/miopen/tensorOp/problem_description.hpp
+++ b/src/include/miopen/tensorOp/problem_description.hpp
@@ -35,7 +35,7 @@ struct NetworkConfig;

 namespace tensorOp {

-struct ProblemDescription : ProblemDescriptionBase
+struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
 {
     ProblemDescription(const miopenTensorOp_t tensorOp_,
                        const void* beta_,

From 3dc0f66d3b7bc1ed4e2cd57af05c08f2bec8d676 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Thu, 5 Dec 2024 17:25:04 +0200
Subject: [PATCH 25/25] kept changes in CastTensor but in tensor.cpp file

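The packed-copy fast path in CastTensor now also requires alpha == 1.0,
since a non-unit alpha has to be applied by the cast kernel rather than a
plain copy, and the cached kernel's network config now encodes the source
type as well as the destination type so kernels are not reused across
differing source types.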
---
 src/tensor.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/tensor.cpp b/src/tensor.cpp
index 9215a40665..000a5ba87e 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -1538,8 +1538,10 @@ void CastTensor(const Handle& handle,
         MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension sizes unsupported.");
     }

+    auto miopen_alpha = *(static_cast<const float*>(alpha));
+
     if(srcDesc.GetType() == dstDesc.GetType() && srcOffset == 0 && dstOffset == 0 &&
-       srcDesc_flat.IsPacked() && dstDesc_flat.IsPacked())
+       srcDesc_flat.IsPacked() && dstDesc_flat.IsPacked() && float_equal(miopen_alpha, 1.0))
     {
         handle.Copy(src, dst, srcDesc_flat.GetElementSize() * GetTypeSize(srcDesc_flat.GetType()));
     }
@@ -1549,7 +1551,9 @@ void CastTensor(const Handle& handle,

         const std::vector<std::size_t>& lens = srcDesc_flat.GetLengths();

-        std::string network_config = "cast " + std::to_string(dstDesc_flat.GetType());
+        // TODO: make proper network config
+        std::string network_config = "cast " + std::to_string(srcDesc_flat.GetType()) +
+                                     std::to_string(dstDesc_flat.GetType());
         for(auto& len : lens)
         {
             network_config += " " + std::to_string(len);
@@ -1558,8 +1562,6 @@ void CastTensor(const Handle& handle,
         auto&& kernels = handle.GetKernels(kernel_name, network_config);
         KernelInvoke kernel;

-        auto miopen_alpha = *(static_cast<const float*>(alpha));
-
         if(!kernels.empty())
        {
             kernel = kernels.front();