From a2972c08ab4bc53ea70ec78c53d784f166ce946f Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Wed, 30 Oct 2024 16:17:08 +0200
Subject: [PATCH 01/25] initial changes and support for 1d generic kernel

---
 src/CMakeLists.txt                          |   2 +
 src/include/miopen/tensor/invoke_params.hpp |  97 +++++++++
 .../miopen/tensor/problem_description.hpp   | 152 ++++++++++++++
 src/include/miopen/tensor/solvers.hpp       |  62 ++++++
 src/include/miopen/tensor_ops.hpp           |  16 ++
 src/solver/tensor/TensorOp1dGeneric.cpp     | 194 ++++++++++++++++++
 src/tensor.cpp                              |  53 +++++
 src/tensor/problem_description.cpp          |  67 ++++++
 test/tensor_ops.cpp                         |  42 ++--
 9 files changed, 664 insertions(+), 21 deletions(-)
 create mode 100644 src/include/miopen/tensor/invoke_params.hpp
 create mode 100644 src/include/miopen/tensor/problem_description.hpp
 create mode 100644 src/include/miopen/tensor/solvers.hpp
 create mode 100644 src/solver/tensor/TensorOp1dGeneric.cpp
 create mode 100644 src/tensor/problem_description.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 92e4f4264a..4a92727cfd 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -337,11 +337,13 @@ set( MIOpen_Source
     solver/softmarginloss/forward_softmarginloss.cpp
     solver/softmax/attn_softmax.cpp
     solver/softmax/softmax.cpp
+    solver/tensor/TensorOp1dGeneric.cpp
     subbuffers.cpp
     t5layernorm_api.cpp
     target_properties.cpp
     temp_file.cpp
     tensor.cpp
+    tensor/problem_description.cpp
     tensor_api.cpp
     transformers_adam_w_api.cpp
     seq_tensor.cpp
diff --git a/src/include/miopen/tensor/invoke_params.hpp b/src/include/miopen/tensor/invoke_params.hpp
new file mode 100644
index 0000000000..68c96fda3f
--- /dev/null
+++ b/src/include/miopen/tensor/invoke_params.hpp
@@ -0,0 +1,97 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <miopen/invoke_params.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+namespace tensor {
+
+struct InvokeParams : public miopen::InvokeParams
+{
+    InvokeParams(miopenTensorOp_t tensorOp_,
+                 const void* alpha0_,
+                 const TensorDescriptor& aTensorDesc_,
+                 ConstData_t ATensor_,
+                 const void* alpha1_,
+                 const TensorDescriptor& bTensorDesc_,
+                 ConstData_t BTensor_,
+                 const void* beta_,
+                 const TensorDescriptor& cTensorDesc_,
+                 Data_t CTensor_,
+                 const size_t Aoffset_,
+                 const size_t Boffset_,
+                 const size_t Coffset_,
+                 const bool nonStandardSquash_)
+        : alpha0(alpha0_),
+          alpha1(alpha1_),
+          beta(beta_),
+          tensorOp(tensorOp_),
+          aTensorDesc(aTensorDesc_),
+          ATensor(ATensor_),
+          bTensorDesc(bTensorDesc_),
+          BTensor(BTensor_),
+          cTensorDesc(cTensorDesc_),
+          CTensor(CTensor_),
+          Aoffset(Aoffset_),
+          Boffset(Boffset_),
+          Coffset(Coffset_),
+          nonStandardSquash(nonStandardSquash_)
+    {
+    }
+
+    size_t GetWorkspaceSize() const { return 0; }
+    Data_t GetWorkspace() const { return nullptr; }
+
+public:
+    const void* alpha0;
+    const void* alpha1;
+    const void* beta;
+
+    miopenTensorOp_t tensorOp;
+
+    TensorDescriptor aTensorDesc;
+    ConstData_t ATensor;
+
+    TensorDescriptor bTensorDesc;
+    ConstData_t BTensor;
+
+    TensorDescriptor cTensorDesc;
+    Data_t CTensor;
+
+    size_t Aoffset;
+    size_t Boffset;
+    size_t Coffset;
+
+    bool nonStandardSquash;
+};
+
+} // namespace tensor
+
+} // namespace miopen
diff --git a/src/include/miopen/tensor/problem_description.hpp b/src/include/miopen/tensor/problem_description.hpp
new file mode 100644
index 0000000000..41a9e4d848
--- /dev/null
+++ b/src/include/miopen/tensor/problem_description.hpp
@@ -0,0 +1,152 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <miopen/problem_description_base.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+struct NetworkConfig;
+
+namespace tensor {
+
+struct ProblemDescription : ProblemDescriptionBase
+{
+    ProblemDescription(const miopenTensorOp_t tensorOp_,
+                       const void* alpha0_,
+                       const void* alpha1_,
+                       const void* beta_,
+                       const TensorDescriptor& aTensorDesc_,
+                       const TensorDescriptor& bTensorDesc_,
+                       const TensorDescriptor& cTensorDesc_,
+                       const bool nonStandardSquash_)
+        : tensorOp(tensorOp_),
+          aTensorDesc(aTensorDesc_),
+          bTensorDesc(bTensorDesc_),
+          cTensorDesc(cTensorDesc_),
+          nonStandardSquash(nonStandardSquash_)
+    {
+        CheckAndAssignAlphaBeta(alpha0_, alpha1_, beta_);
+
+        if(aTensorDesc.GetElementSize() != cTensorDesc.GetElementSize())
+        {
+            MIOPEN_THROW("A and C Tensors do not match");
+        }
+
+        if(bTensorDesc.GetType() != cTensorDesc.GetType())
+        {
+            MIOPEN_THROW("Datatypes for B and C tensors do not match!");
+        }
+
+        auto blens = bTensorDesc.GetLengths();
+        auto clens = cTensorDesc.GetLengths();
+        if(clens.size() > 5)
+        {
+            MIOPEN_THROW("Tensor dimension larger than 5: " + std::to_string(clens.size()));
+        }
+
+        if(blens.size() != clens.size())
+        {
+            MIOPEN_THROW("Number of dims in B and C Tensors do not match: " +
+                         std::to_string(blens.size()) + ", " + std::to_string(clens.size()));
+        }
+
+        if(!nonStandardSquash)
+        {
+            for(std::size_t i = 0; i < clens.size(); i++)
+            {
+                if(blens[i] != 1 && blens[i] != clens[i])
+                {
+                    MIOPEN_THROW("BTensor dim != 1 && BTensor dim != CTensor dim: " +
+                                 std::to_string(i));
+                }
+            }
+        }
+        else
+        {
+            // non-standard behavior because blens[1] may not be equal to clens[1]
+            if(!(clens.size() == 3 && blens[0] == 1 && clens[0] == 1 && blens[2] == clens[2]))
+            {
+                MIOPEN_THROW(
+                    "Non standard squashed operation supported only for 3d tensors and for "
+                    "the specific configuration");
+            }
+        }
+    }
+
+    const miopenTensorOp_t GetTensorOp() const { return tensorOp; }
+
+    const void* GetAlpha0() const { return alpha0; }
+    const void* GetAlpha1() const { return alpha1; }
+    const void* GetBeta() const { return beta; }
+
+    const TensorDescriptor& GetATensorDesc() const { return aTensorDesc; }
+    const TensorDescriptor& GetBTensorDesc() const { return bTensorDesc; }
+    const TensorDescriptor& GetCTensorDesc() const { return cTensorDesc; }
+
+    const bool GetNonStandardSquash() const { return nonStandardSquash; }
+
+    NetworkConfig MakeNetworkConfig() const override;
+
+private:
+    void CheckAndAssignAlphaBeta(const void* alpha0_, const void* alpha1_, const void* beta_)
+    {
+        if(alpha0_ == nullptr)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Alpha0 value is nullptr");
+        }
+        if(alpha1_ == nullptr)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Alpha1 value is nullptr");
+        }
+        if(beta_ == nullptr)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Beta value is nullptr");
+        }
+
+        alpha0 = alpha0_;
+        alpha1 = alpha1_;
+        beta   = beta_;
+    }
+
+    const miopenTensorOp_t tensorOp;
+
+    const void* alpha0;
+    const void* alpha1;
+    const void* beta;
+
+    const TensorDescriptor& aTensorDesc;
+    const TensorDescriptor& bTensorDesc;
+    const TensorDescriptor& cTensorDesc;
+
+    const bool nonStandardSquash;
+};
+
+} // namespace tensor
+
+} // namespace miopen
diff --git a/src/include/miopen/tensor/solvers.hpp b/src/include/miopen/tensor/solvers.hpp
new file mode 100644
index 0000000000..072f708c84
--- /dev/null
+++ b/src/include/miopen/tensor/solvers.hpp
@@ -0,0 +1,62 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <miopen/solver.hpp>
+#include <miopen/tensor/problem_description.hpp>
+
+#include <utility>
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensor {
+
+using TensorOpSolver = NonTunableSolverBase<ExecutionContext, miopen::tensor::ProblemDescription>;
+
+struct Op1dTensorGeneric final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op1dTensorGeneric>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensor::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensor::ProblemDescription& problem) const override;
+
+    std::size_t GetWorkspaceSize(const ExecutionContext& context,
+                                 const miopen::tensor::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
+} // namespace tensor
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/include/miopen/tensor_ops.hpp b/src/include/miopen/tensor_ops.hpp
index 25d838598b..fa5be24048 100644
--- a/src/include/miopen/tensor_ops.hpp
+++ b/src/include/miopen/tensor_ops.hpp
@@ -189,6 +189,22 @@ MIOPEN_INTERNALS_EXPORT void OpTensor(const Handle& handle,
                                       size_t Coffset = 0,
                                       bool nonStandardSquash = false);
 
+MIOPEN_INTERNALS_EXPORT void OpTensorNew(Handle& handle,
+                                         miopenTensorOp_t tensorOp,
+                                         const void* alpha0,
+                                         const TensorDescriptor& aTensorDesc,
+                                         ConstData_t ATensor,
+                                         const void* alpha1,
+                                         const TensorDescriptor& bTensorDesc,
+                                         ConstData_t BTensor,
+                                         const void* beta,
+                                         const TensorDescriptor& cTensorDesc,
+                                         Data_t CTensor,
+                                         size_t Aoffset = 0,
+                                         size_t Boffset = 0,
+                                         size_t Coffset = 0,
+                                         bool nonStandardSquash = false);
+
 MIOPEN_INTERNALS_EXPORT void CopyTensor(const Handle& handle,
                                         const TensorDescriptor& srcDesc,
                                         ConstData_t src,
diff --git a/src/solver/tensor/TensorOp1dGeneric.cpp b/src/solver/tensor/TensorOp1dGeneric.cpp
new file mode 100644
index 0000000000..3eebb7b950
--- /dev/null
+++ b/src/solver/tensor/TensorOp1dGeneric.cpp
@@ -0,0 +1,194 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/tensor/solvers.hpp>
+
+#include <miopen/tensor/invoke_params.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/float_equal.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/visit_float.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensor {
+
+bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context,
+                                     const miopen::tensor::ProblemDescription& problem) const
+{
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto alens       = aTensorDesc.GetLengths();
+    auto blens       = bTensorDesc.GetLengths();
+    auto asize       = alens.size();
+
+    if(asize == 1)
+    {
+        return true;
+    }
+    if(asize == 2 && ((blens[0] == 1 && blens[1] == 1) || (blens[0] > 1 && blens[1] > 1)))
+    {
+        return true;
+    }
+    if(asize == 3 && ((blens[0] == 1 && blens[1] == 1 && blens[2] == 1) ||
+                      (blens[0] > 1 && blens[1] > 1 && blens[2] > 1)))
+    {
+        return true;
+    }
+    return false;
+}
+
+std::size_t
+Op1dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context,
+                                    const miopen::tensor::ProblemDescription& problem) const
+{
+    return 0;
+}
+
+ConvSolution Op1dTensorGeneric::GetSolution(const ExecutionContext& context,
+                                            const miopen::tensor::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto cTensorDesc = problem.GetCTensorDesc();
+
+    auto clens = cTensorDesc.GetLengths();
+
+    size_t local_threads = 256;
+    size_t max_num_wg    = 4096;
+
+    auto num_wg = std::clamp(clens[0] / local_threads, size_t(1), size_t(max_num_wg));
+    num_wg      = num_wg > max_num_wg ? max_num_wg : num_wg;
+    size_t global_threads = num_wg * local_threads;
+
+    const std::vector<size_t> vld{local_threads, 1, 1};
+    const std::vector<size_t> vgd{global_threads, 1, 1};
+
+    KernelBuildParameters build_params =
+        KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}};
+
+    // build_params.Define("MIOPEN_TENSOR_OP", std::to_string(problem.GetTensorOp()));
+
+    switch(problem.GetTensorOp())
+    {
+    case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break;
+    case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break;
+    case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break;
+    case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break;
+    }
+
+    if(aTensorDesc.AllDimsFitIntoInt())
+    {
+        build_params.Define("DIM_TYPE", "uint32_t");
+    }
+    else
+    {
+        build_params.Define("DIM_TYPE", "uint64_t");
+    }
+
+    build_params.Define("USE_1D_TENSOR_GENERIC");
+
+    auto kernel = KernelInfo{};
+
+    kernel.comp_options = build_params.GenerateFor(
+        kbp::HIP{}); // GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{});
+    kernel.kernel_file = "MIOpenTensorKernelsHip.cpp";
+    kernel.kernel_name = "Op1dTensorGeneric";
+
+    for(uint32_t i = 0; i <= 2; i++)
+    {
+        kernel.l_wk.push_back(vld[i]);
+        kernel.g_wk.push_back(vgd[i]);
+    }
+
+    result.invoker_factory = [=](const std::vector<Kernel> kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::tensor::InvokeParams>();
+
+            visit_float(bTensorDesc.GetType(), [&](auto as_float) {
+                auto miopen_alpha0 = as_float(*(static_cast<const float*>(params.alpha0)));
+                auto miopen_alpha1 = as_float(*(static_cast<const float*>(params.alpha1)));
+                auto miopen_beta   = as_float(*(static_cast<const float*>(params.beta)));
+
+                auto blens = params.bTensorDesc.GetLengths();
+                auto clens = params.cTensorDesc.GetLengths();
+
+                auto astrides = params.aTensorDesc.GetStrides();
+                auto bstrides = params.bTensorDesc.GetStrides();
+                auto cstrides = params.cTensorDesc.GetStrides();
+
+                if(aTensorDesc.AllDimsFitIntoInt())
+                { // change offsets to 64bit after PR is merged
+                    kernel(params.ATensor,
+                           params.BTensor,
+                           params.CTensor,
+                           static_cast<uint32_t>(params.Aoffset),
+                           static_cast<uint32_t>(params.Boffset),
+                           static_cast<uint32_t>(params.Coffset),
+                           static_cast<uint32_t>(astrides[0]),
+                           static_cast<uint32_t>(blens[0] == 1 ? 0 : bstrides[0]),
+                           static_cast<uint32_t>(cstrides[0]),
+                           miopen_alpha0,
+                           miopen_alpha1,
+                           miopen_beta,
+                           static_cast<uint32_t>(clens[0]),
+                           !float_equal(miopen_beta, 0.0));
+                }
+                else
+                {
+                    kernel(params.ATensor,
+                           params.BTensor,
+                           params.CTensor,
+                           static_cast<uint64_t>(params.Aoffset),
+                           static_cast<uint64_t>(params.Boffset),
+                           static_cast<uint64_t>(params.Coffset),
+                           static_cast<uint64_t>(astrides[0]),
+                           static_cast<uint64_t>(blens[0] == 1 ? 0 : bstrides[0]),
+                           static_cast<uint64_t>(cstrides[0]),
+                           miopen_alpha0,
+                           miopen_alpha1,
+                           miopen_beta,
+                           static_cast<uint64_t>(clens[0]),
+                           !float_equal(miopen_beta, 0.0));
+                }
+            });
+        };
+    };
+    result.construction_params.push_back(kernel);
+
+    return result;
+}
+
+} // namespace tensor
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/tensor.cpp b/src/tensor.cpp
index 3e5190bc25..89400c9166 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -28,6 +28,10 @@
 #include
 #include
 #include
+#include <miopen/tensor/invoke_params.hpp>
+#include <miopen/tensor/problem_description.hpp>
+#include <miopen/tensor/solvers.hpp>
+#include <miopen/find_solution.hpp>
 
 #include
 
@@ -868,6 +872,55 @@ void from_json(const nlohmann::json& j, TensorDescriptor& descriptor)
     j.at("type").get_to(descriptor.type);
 }
 
+void OpTensorNew(Handle& handle,
+                 miopenTensorOp_t tensorOp,
+                 const void* alpha0,
+                 const TensorDescriptor& aTensorDesc,
+                 ConstData_t ATensor,
+                 const void* alpha1,
+                 const TensorDescriptor& bTensorDesc,
+                 ConstData_t BTensor,
+                 const void* beta,
+                 const TensorDescriptor& cTensorDesc,
+                 Data_t CTensor,
+                 const size_t Aoffset,
+                 const size_t Boffset,
+                 const size_t Coffset,
+                 bool nonStandardSquash)
+{
+    if(ATensor == nullptr || BTensor == nullptr || CTensor == nullptr)
+    {
+        MIOPEN_THROW(miopenStatusBadParm);
+    }
+
+    const auto problem = tensor::ProblemDescription{
+        tensorOp, alpha0, alpha1, beta, aTensorDesc, bTensorDesc, cTensorDesc, nonStandardSquash};
+
+    const auto invoke_params = tensor::InvokeParams{tensorOp,
+                                                    alpha0,
+                                                    aTensorDesc,
+                                                    ATensor,
+                                                    alpha1,
+                                                    bTensorDesc,
+                                                    BTensor,
+                                                    beta,
+                                                    cTensorDesc,
+                                                    CTensor,
+                                                    Aoffset,
+                                                    Boffset,
+                                                    Coffset,
+                                                    nonStandardSquash};
+
+    const auto tensor_dim = aTensorDesc.GetLengths().size();
+
+    if(tensor_dim == 1)
+    {
+        const auto algo    = AlgorithmName{"Op1dTensorGeneric"};
+        const auto solvers = solver::SolverContainer<solver::tensor::Op1dTensorGeneric>{};
+        solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
+    }
+}
+
 } // namespace miopen
 
 int miopenGetTensorIndex(miopenTensorDescriptor_t tensorDesc, std::initializer_list<int> indices)
diff --git a/src/tensor/problem_description.cpp b/src/tensor/problem_description.cpp
new file mode 100644
index 0000000000..562efd6389
--- /dev/null
+++ b/src/tensor/problem_description.cpp
@@ -0,0 +1,67 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/tensor/problem_description.hpp>
+#include <miopen/names.hpp>
+
+namespace miopen {
+
+namespace tensor {
+
+NetworkConfig ProblemDescription::MakeNetworkConfig() const
+{
+    std::ostringstream ss;
+    const auto tensor_dim = aTensorDesc.GetLengths().size();
+
+    ss << std::to_string(bTensorDesc.GetType()) << "-" << std::to_string(aTensorDesc.GetType())
+       << "-" << std::to_string(tensorOp);
+
+    if(tensor_dim == 1)
+    {
+        size_t local_threads = 256;
+        int max_num_wg       = 4096;
+        int num_wg =
+            std::clamp(cTensorDesc.GetLengths()[0] / local_threads, size_t(1), size_t(max_num_wg));
+        num_wg                = num_wg > max_num_wg ? max_num_wg : num_wg;
+        size_t global_threads = num_wg * local_threads;
+        ss << "-" << std::to_string(global_threads) << "-" << std::to_string(local_threads);
+
+        if(aTensorDesc.AllDimsFitIntoInt())
+        {
+            ss << "-32bit";
+        }
+        else
+        {
+            ss << "-64bit";
+        }
+    }
+
+    return NetworkConfig{ss.str()};
+}
+
+} // namespace tensor
+
+} // namespace miopen
diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp
index 3121715e8a..152443e4fc 100644
--- a/test/tensor_ops.cpp
+++ b/test/tensor_ops.cpp
@@ -181,24 +181,24 @@ struct verify_tensor_ops
         auto a_dev = handle.Write(a.data);
         auto b_dev = handle.Write(b.data);
 
-        miopen::OpTensor(handle,
-                         // miopenTensorOpAdd,
-                         // miopenTensorOpMax,
-                         // miopenTensorOpMin,
-                         miopenTensorOpMul,
-                         &alpha0,
-                         a.desc,
-                         a_dev.get(),
-                         &alpha1,
-                         b.desc,
-                         b_dev.get(),
-                         &beta,
-                         c.desc,
-                         c_dev.get(),
-                         Aoffset,
-                         Boffset,
-                         Coffset,
-                         false); // it does not verify non-standard behaviour
+        miopen::OpTensorNew(handle,
+                            // miopenTensorOpAdd,
+                            // miopenTensorOpMax,
+                            // miopenTensorOpMin,
+                            miopenTensorOpMul,
+                            &alpha0,
+                            a.desc,
+                            a_dev.get(),
+                            &alpha1,
+                            b.desc,
+                            b_dev.get(),
+                            &beta,
+                            c.desc,
+                            c_dev.get(),
+                            Aoffset,
+                            Boffset,
+                            Coffset,
+                            false); // it does not verify non-standard behaviour
 
         if(not no_validate)
         {
@@ -241,12 +241,12 @@ struct tensor_ops_driver : test_driver
 
     std::vector<std::vector<int>> get_sub_tensor_a()
     {
-        return {{32, 16, 8, 4, 4}, {16, 20, 16, 8}, {20, 16, 8}, {1, 16, 8}, {16, 8}, {8}};
+        return {/*{32, 16, 8, 4, 4}, {16, 20, 16, 8}, {20, 16, 8}, {1, 16, 8}, {16, 8},*/ {8}};
     }
 
     std::vector<std::vector<int>> get_sub_tensor_b()
    {
-        return {{32, 16, 8, 4, 4},
+        return {/*{32, 16, 8, 4, 4},
                 {32, 16, 1, 1, 1},
                 {1, 16, 8, 1, 1},
                 {1, 1, 8, 4, 1},
                 {20, 16, 8},
                 {20, 16, 1},
                 {1, 16, 8},
                 {1, 16, 1},
                 {20, 1, 1},
                 {16, 8},
                 {16, 1},
-                {1, 8},
+                {1, 8},*/
                 {8},
                 {1}};
     }

From 75cecb268320a86ab2298e101d4c41c0cd68d788 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Wed, 30 Oct 2024 16:19:09 +0200
Subject: [PATCH 02/25] 1d solver file name change

---
 .../tensor/{TensorOp1dGeneric.cpp => Op1dTensorGeneric.cpp}       | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/solver/tensor/{TensorOp1dGeneric.cpp => Op1dTensorGeneric.cpp} (100%)

diff --git a/src/solver/tensor/TensorOp1dGeneric.cpp b/src/solver/tensor/Op1dTensorGeneric.cpp
similarity index 100%
rename from src/solver/tensor/TensorOp1dGeneric.cpp
rename to src/solver/tensor/Op1dTensorGeneric.cpp

From 035989c0a2dd83a82a9e0cada7454b70b66ad680 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Wed, 30 Oct 2024 16:20:29 +0200
Subject: [PATCH 03/25] solver name change in cmakelists.txt

---
 src/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4a92727cfd..6726430535 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -337,7 +337,7 @@ set( MIOpen_Source
     solver/softmarginloss/forward_softmarginloss.cpp
     solver/softmax/attn_softmax.cpp
     solver/softmax/softmax.cpp
-    solver/tensor/TensorOp1dGeneric.cpp
+    solver/tensor/Op1dTensorGeneric.cpp
     subbuffers.cpp
     t5layernorm_api.cpp
     target_properties.cpp

From cf91070de6b0d58b310e34f38cddf4145d2f50b2 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Thu, 31 Oct 2024 15:53:30 +0200
Subject: [PATCH 04/25] more changes, 2d generic and 2d lite kernel

---
 src/CMakeLists.txt                      |   2 +
 src/include/miopen/tensor/solvers.hpp   |  32 ++++
 src/solver/tensor/Op1dTensorGeneric.cpp |  19 ++-
 src/solver/tensor/Op2dTensorGeneric.cpp | 173 +++++++++++++++++++
 src/solver/tensor/Op2dTensorLite.cpp    | 217 ++++++++++++++++++++++++
 src/tensor.cpp                          |  13 +-
 src/tensor/problem_description.cpp      |  44 ++++-
 test/tensor_ops.cpp                     |   6 +-
 8 files changed, 482 insertions(+), 24 deletions(-)
 create mode 100644 src/solver/tensor/Op2dTensorGeneric.cpp
 create mode 100644 src/solver/tensor/Op2dTensorLite.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6726430535..9dc938cae4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -338,6 +338,8 @@ set( MIOpen_Source
     solver/softmax/attn_softmax.cpp
     solver/softmax/softmax.cpp
     solver/tensor/Op1dTensorGeneric.cpp
+    solver/tensor/Op2dTensorGeneric.cpp
+    solver/tensor/Op2dTensorLite.cpp
     subbuffers.cpp
     t5layernorm_api.cpp
     target_properties.cpp
diff --git a/src/include/miopen/tensor/solvers.hpp b/src/include/miopen/tensor/solvers.hpp
index 072f708c84..d87d54d674 100644
--- a/src/include/miopen/tensor/solvers.hpp
+++ b/src/include/miopen/tensor/solvers.hpp
@@ -55,6 +55,38 @@ struct Op1dTensorGeneric final : TensorOpSolver
     bool MayNeedWorkspace() const override { return false; }
 };
 
+struct Op2dTensorGeneric final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op2dTensorGeneric>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensor::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensor::ProblemDescription& problem) const override;
+
+    std::size_t GetWorkspaceSize(const ExecutionContext& context,
+                                 const miopen::tensor::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
+struct Op2dTensorLite final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op2dTensorLite>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensor::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensor::ProblemDescription& problem) const override;
+
+    std::size_t GetWorkspaceSize(const ExecutionContext& context,
+                                 const miopen::tensor::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
 } // namespace tensor
 
 } // namespace solver
diff --git a/src/solver/tensor/Op1dTensorGeneric.cpp b/src/solver/tensor/Op1dTensorGeneric.cpp
index 3eebb7b950..e550ec7952 100644
--- a/src/solver/tensor/Op1dTensorGeneric.cpp
+++ b/src/solver/tensor/Op1dTensorGeneric.cpp
@@ -51,15 +51,16 @@ bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context,
     {
         return true;
     }
-    if(asize == 2 && ((blens[0] == 1 && blens[1] == 1) || (blens[0] > 1 && blens[1] > 1)))
-    {
-        return true;
-    }
-    if(asize == 3 && ((blens[0] == 1 && blens[1] == 1 && blens[2] == 1) ||
-                      (blens[0] > 1 && blens[1] > 1 && blens[2] > 1)))
-    {
-        return true;
-    }
+    // add support for this later
+    // if(asize == 2 && ((blens[0] == 1 && blens[1] == 1) || (blens[0] > 1 && blens[1] > 1)))
+    // {
+    //     return true;
+    // }
+    // if(asize == 3 && ((blens[0] == 1 && blens[1] == 1 && blens[2] == 1) ||
+    //                   (blens[0] > 1 && blens[1] > 1 && blens[2] > 1)))
+    // {
+    //     return true;
+    // }
     return false;
 }
 
diff --git a/src/solver/tensor/Op2dTensorGeneric.cpp b/src/solver/tensor/Op2dTensorGeneric.cpp
new file mode 100644
index 0000000000..6155117ba9
--- /dev/null
+++ b/src/solver/tensor/Op2dTensorGeneric.cpp
@@ -0,0 +1,173 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/tensor/solvers.hpp>
+
+#include <miopen/tensor/invoke_params.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/float_equal.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/visit_float.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensor {
+
+bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context,
+                                     const miopen::tensor::ProblemDescription& problem) const
+{
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto alens       = aTensorDesc.GetLengths();
+    auto blens       = bTensorDesc.GetLengths();
+    auto asize       = alens.size();
+
+    if(asize == 2)
+    {
+        return true;
+    }
+    // add applicable when asize == 3 and some special cases for b dimensions
+
+    return false;
+}
+
+std::size_t
+Op2dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context,
+                                    const miopen::tensor::ProblemDescription& problem) const
+{
+    return 0;
+}
+
+ConvSolution Op2dTensorGeneric::GetSolution(const ExecutionContext& context,
+                                            const miopen::tensor::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto cTensorDesc = problem.GetCTensorDesc();
+
+    auto clens = cTensorDesc.GetLengths();
+
+    size_t local_threads = 32;
+    size_t max_num_wg    = 4096;
+
+    auto num_wg = std::clamp((clens[0] * clens[1]) / local_threads, size_t(1), size_t(max_num_wg));
+    num_wg      = num_wg > max_num_wg ? max_num_wg : num_wg;
+    size_t global_threads = num_wg * local_threads;
+
+    const std::vector<size_t> vld{local_threads, 1, 1};
+    const std::vector<size_t> vgd{global_threads, 1, 1};
+
+    KernelBuildParameters build_params =
+        KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}};
+
+    // build_params.Define("MIOPEN_TENSOR_OP", std::to_string(problem.GetTensorOp()));
+
+    switch(problem.GetTensorOp())
+    {
+    case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break;
+    case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break;
+    case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break;
+    case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break;
+    }
+
+    // support for 64bit still not merged
+    // if(aTensorDesc.AllDimsFitIntoInt())
+    // {
+    //     build_params.Define("DIM_TYPE", "uint32_t");
+    // }
+    // else
+    // {
+    //     build_params.Define("DIM_TYPE", "uint64_t");
+    // }
+
+    build_params.Define("USE_2D_TENSOR_GENERIC");
+
+    auto kernel = KernelInfo{};
+
+    kernel.comp_options = build_params.GenerateFor(
+        kbp::HIP{}); // GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{});
+    kernel.kernel_file = "MIOpenTensorKernelsHip.cpp";
+    kernel.kernel_name = "Op2dTensorGeneric";
+
+    for(uint32_t i = 0; i <= 2; i++)
+    {
+        kernel.l_wk.push_back(vld[i]);
+        kernel.g_wk.push_back(vgd[i]);
+    }
+
+    result.invoker_factory = [=](const std::vector<Kernel> kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::tensor::InvokeParams>();
+
+            visit_float(bTensorDesc.GetType(), [&](auto as_float) {
+                auto miopen_alpha0 = as_float(*(static_cast<const float*>(params.alpha0)));
+                auto miopen_alpha1 = as_float(*(static_cast<const float*>(params.alpha1)));
+                auto miopen_beta   = as_float(*(static_cast<const float*>(params.beta)));
+
+                auto blens = params.bTensorDesc.GetLengths();
+                auto clens = params.cTensorDesc.GetLengths();
+
+                auto astrides = params.aTensorDesc.GetStrides();
+                auto bstrides = params.bTensorDesc.GetStrides();
+                auto cstrides = params.cTensorDesc.GetStrides();
+
+                kernel(params.ATensor,
+                       params.BTensor,
+                       params.CTensor,
+                       static_cast<uint32_t>(params.Aoffset),
+                       static_cast<uint32_t>(params.Boffset),
+                       static_cast<uint32_t>(params.Coffset),
+                       static_cast<uint32_t>(blens[1] == 1 ? clens[1] : blens[1]),
+                       static_cast<uint32_t>(clens[1]),
+                       static_cast<uint32_t>(astrides[0]),
+                       static_cast<uint32_t>(astrides[1]),
+                       static_cast<uint32_t>(blens[0] == 1 ? 0 : bstrides[0]),
+                       static_cast<uint32_t>(blens[1] == 1 ? 0 : bstrides[1]),
+                       static_cast<uint32_t>(cstrides[0]),
+                       static_cast<uint32_t>(cstrides[1]),
+                       miopen_alpha0,
+                       miopen_alpha1,
+                       miopen_beta,
+                       static_cast<uint32_t>(clens[0]),
+                       !float_equal(miopen_beta, 0.0));
+            });
+        };
+    };
+    result.construction_params.push_back(kernel);
+
+    return result;
+}
+
+} // namespace tensor
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/tensor/Op2dTensorLite.cpp b/src/solver/tensor/Op2dTensorLite.cpp
new file mode 100644
index 0000000000..712a32d49d
--- /dev/null
+++ b/src/solver/tensor/Op2dTensorLite.cpp
@@ -0,0 +1,217 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/tensor/solvers.hpp>
+
+#include <miopen/tensor/invoke_params.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/float_equal.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/visit_float.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensor {
+
+bool Op2dTensorLite::IsApplicable(const ExecutionContext& context,
+                                  const miopen::tensor::ProblemDescription& problem) const
+{
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto cTensorDesc = problem.GetCTensorDesc();
+
+    auto alens = aTensorDesc.GetLengths();
+    auto blens = bTensorDesc.GetLengths();
+    auto clens = cTensorDesc.GetLengths();
+
+    auto asize = alens.size();
+
+    size_t local_threads = 256;
+    int max_num_wg       = 4096;
+
+    // for naive tensor ops
+    size_t RD_BLCK     = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1;
+    size_t total_work  = std::max(clens[2] / RD_BLCK, size_t(1));
+    size_t grp_sz      = (total_work + local_threads - 1) / local_threads;
+
+    // opencl kernels are no longer supported, fallback to generic case
+    bool lite_applicable = grp_sz <= size_t(max_num_wg);
+
+    bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 &&
+                   (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2];
+
+    if(asize == 3 && lite_applicable && is_lite)
+    {
+        return true;
+    }
+
+    return false;
+}
+
+std::size_t
+Op2dTensorLite::GetWorkspaceSize(const ExecutionContext& context,
+                                 const miopen::tensor::ProblemDescription& problem) const
+{
+    return 0;
+}
+
+ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context,
+                                         const miopen::tensor::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto cTensorDesc = problem.GetCTensorDesc();
+
+    auto alens = aTensorDesc.GetLengths();
+    auto blens = bTensorDesc.GetLengths();
+    auto clens = cTensorDesc.GetLengths();
+
+    auto astrides = aTensorDesc.GetStrides();
+    auto bstrides = bTensorDesc.GetStrides();
+    auto cstrides = cTensorDesc.GetStrides();
+
+    // first_not_one is incorrect if BTensor size is equal to 1
+    auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; });
+    auto d             = std::distance(blens.begin(), first_not_one.base());
+
+    // quick fix
+    int num_wg = first_not_one != blens.rend()
+                     ? static_cast<int>(*first_not_one == 0 ? 1 : *first_not_one)
+                     : 1;
+    for(int i = (d - 2); i >= 0; i--)
+    {
+        if(blens[i] != 1)
+        {
+            num_wg *= blens[i];
+        }
+    }
+    int max_num_wg = 4096;
+    num_wg         = num_wg > max_num_wg ? max_num_wg : num_wg;
+
+    size_t local_threads = 256;
+
+    // for naive tensor ops
+    size_t RD_BLCK              = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1;
+    const std::string data_type = GetDataType(bTensorDesc.GetType());
+    const std::string READ_TYPE = (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK);
+
+    size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1));
+    size_t grp_sz     = (total_work + local_threads - 1) / local_threads;
+
+    grp_sz        = std::min(size_t(max_num_wg), grp_sz);
+    size_t glb_sz = local_threads * grp_sz;
+
+    size_t local_threads2 = 64;
+    size_t total_work2    = clens[1];
+    size_t grp_sz2        = (total_work2 + local_threads2 - 1) / local_threads2;
+    grp_sz2               = std::min(size_t(max_num_wg / grp_sz), grp_sz2);
+    size_t glb_sz2        = local_threads2 * grp_sz2;
+
+    const std::vector<size_t> vld{local_threads, 1, 1};
+    const std::vector<size_t> vgd{glb_sz, glb_sz2, 1};
+
+    KernelBuildParameters build_params =
+        KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}};
+
+    // build_params.Define("MIOPEN_TENSOR_OP", std::to_string(problem.GetTensorOp()));
+
+    switch(problem.GetTensorOp())
+    {
+    case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break;
+    case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break;
+    case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break;
+    case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break;
+    }
+
+    // support for 64bit still not merged
+    // if(aTensorDesc.AllDimsFitIntoInt())
+    // {
+    //     build_params.Define("DIM_TYPE", "uint32_t");
+    // }
+    // else
+    // {
+    //     build_params.Define("DIM_TYPE", "uint64_t");
+    // }
+
+    build_params.Define("USE_2D_TENSOR_LITE");
+    build_params.Define("RD_BLCK", std::to_string(RD_BLCK));
+    build_params.Define("READ_TYPE", READ_TYPE);
+
+    auto kernel = KernelInfo{};
+
+    kernel.comp_options = build_params.GenerateFor(
+        kbp::HIP{}); // GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{});
+    kernel.kernel_file = "MIOpenTensorKernels.cl";
+    kernel.kernel_name = "Op2dTensorLite";
+
+    for(uint32_t i = 0; i <= 2; i++)
+    {
+        kernel.l_wk.push_back(vld[i]);
+        kernel.g_wk.push_back(vgd[i]);
+    }
+
+    result.invoker_factory = [=](const std::vector<Kernel> kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::tensor::InvokeParams>();
+
+            visit_float(bTensorDesc.GetType(), [&](auto as_float) {
+                auto miopen_alpha0 = as_float(*(static_cast<const float*>(params.alpha0)));
+                auto miopen_alpha1 = as_float(*(static_cast<const float*>(params.alpha1)));
+                auto miopen_beta   = as_float(*(static_cast<const float*>(params.beta)));
+
+                kernel(params.ATensor,
+                       static_cast<int>(astrides[1]),
+                       params.BTensor,
+                       static_cast<int>(bstrides[1]),
+                       params.CTensor,
+                       static_cast<int>(cstrides[1]),
+                       miopen_alpha0,
+                       miopen_alpha1,
+                       miopen_beta,
+                       static_cast<int>(params.Aoffset),
+                       static_cast<int>(params.Boffset),
+                       static_cast<int>(params.Coffset),
+                       static_cast<int>(!float_equal(miopen_beta, 0.0)),
+                       static_cast<int>(blens[1] == 1));
+            });
+        };
+    };
+    result.construction_params.push_back(kernel);
+
+    return result;
+}
+
+} // namespace tensor
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/tensor.cpp b/src/tensor.cpp
index 89400c9166..8280201c41 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -911,14 +911,11 @@ void OpTensorNew(Handle& handle,
                                                     Coffset,
                                                     nonStandardSquash};
 
-    const auto tensor_dim = aTensorDesc.GetLengths().size();
-
-    if(tensor_dim == 1)
-    {
-        const auto algo    = AlgorithmName{"Op1dTensorGeneric"};
-        const auto solvers = solver::SolverContainer<solver::tensor::Op1dTensorGeneric>{};
-        solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
-    }
+    const auto algo    = AlgorithmName{"TensorOpSolver"};
+    const auto solvers = solver::SolverContainer<solver::tensor::Op1dTensorGeneric>{} +
+                         solver::SolverContainer<solver::tensor::Op2dTensorLite>{} +
+                         solver::SolverContainer<solver::tensor::Op2dTensorGeneric>{};
+    solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
 }
 
 } // namespace miopen
diff --git a/src/tensor/problem_description.cpp b/src/tensor/problem_description.cpp
index 562efd6389..a3460ccff7 100644
--- a/src/tensor/problem_description.cpp
+++ b/src/tensor/problem_description.cpp
@@ -36,15 +36,19 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
     std::ostringstream ss;
     const auto tensor_dim = aTensorDesc.GetLengths().size();
 
+    auto alens = aTensorDesc.GetLengths();
+    auto blens = bTensorDesc.GetLengths();
+    auto clens = cTensorDesc.GetLengths();
+
+    size_t local_threads = 256;
+    int max_num_wg       = 4096;
+
     ss << std::to_string(bTensorDesc.GetType()) << "-" << std::to_string(aTensorDesc.GetType())
        << "-" << std::to_string(tensorOp);
 
     if(tensor_dim == 1)
     {
-        size_t local_threads = 256;
-        int max_num_wg       = 4096;
-        int num_wg =
-            std::clamp(cTensorDesc.GetLengths()[0] / local_threads, size_t(1), size_t(max_num_wg));
+        int num_wg = std::clamp(clens[0] / local_threads, size_t(1), size_t(max_num_wg));
         num_wg                = num_wg > max_num_wg ? max_num_wg : num_wg;
         size_t global_threads = num_wg * local_threads;
         ss << "-" << std::to_string(global_threads) << "-" << std::to_string(local_threads);
@@ -58,6 +62,38 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
         if(aTensorDesc.AllDimsFitIntoInt())
         {
             ss << "-32bit";
        }
        else
        {
             ss << "-64bit";
        }
     }
+    else if(tensor_dim == 2)
+    {
+        local_threads = 32;
+        int num_wg =
+            std::clamp((clens[0] * clens[1]) / local_threads, size_t(1), size_t(max_num_wg));
+        num_wg                = num_wg > max_num_wg ? max_num_wg : num_wg;
+        size_t global_threads = num_wg * local_threads;
+        ss << "-" << std::to_string(global_threads) << "-" << std::to_string(local_threads);
+    }
+    else if(tensor_dim == 3)
+    {
+
+        size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1;
+        size_t total_work     = std::max(clens[2] / RD_BLCK, size_t(1));
+        size_t grp_sz         = (total_work + local_threads - 1) / local_threads;
+        size_t local_threads2 = 64;
+        size_t total_work2    = clens[1];
+        size_t grp_sz2        = (total_work2 + local_threads2 - 1) / local_threads2;
+        grp_sz2               = std::min(size_t(max_num_wg / grp_sz), grp_sz2);
+
+        bool lite_applicable = grp_sz <= size_t(max_num_wg);
+
+        bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 &&
+                       (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2];
+
+        if(lite_applicable && is_lite)
+        {
+            ss << "-" << std::to_string(RD_BLCK) << "x" << std::to_string(local_threads) << "x"
+               << std::to_string(grp_sz) << std::to_string(local_threads2)
+               << std::to_string(grp_sz2);
+        }
+    }
 
     return NetworkConfig{ss.str()};
 }
diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp
index 152443e4fc..b18c0276fa 100644
--- a/test/tensor_ops.cpp
+++ b/test/tensor_ops.cpp
@@ -241,7 +241,7 @@ struct tensor_ops_driver : test_driver
 
     std::vector<std::vector<int>> get_sub_tensor_a()
     {
-        return {/*{32, 16, 8, 4, 4}, {16, 20, 16, 8}, {20, 16, 8}, {1, 16, 8}, {16, 8},*/ {8}};
+        return {/*{32, 16, 8, 4, 4}, {16, 20, 16, 8}, {20, 16, 8}, {1, 16, 8},*/ {16, 8}, {8}};
     }
 
     std::vector<std::vector<int>> get_sub_tensor_b()
@@ -263,10 +263,10 @@ struct tensor_ops_driver : test_driver
                 {20, 16, 1},
                 {1, 16, 8},
                 {1, 16, 1},
-                {20, 1, 1},
+                {20, 1, 1},*/
                 {16, 8},
                 {16, 1},
-                {1, 8},*/
+                {1, 8},
                 {8},
                 {1}};
     }

From f2a11d648b1fd0fb1667535b64aae2e57d17b511 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Fri, 1 Nov 2024 18:32:08 +0200
Subject: [PATCH 05/25] some changes suggested in the comments

---
 src/CMakeLists.txt                            |   9 +-
 .../{tensor => tensorOp}/invoke_params.hpp    |  10 +-
 .../problem_description.hpp                   |  42 +---
 .../miopen/{tensor => tensorOp}/solvers.hpp   |  54 +++--
 src/include/miopen/tensor_ops.hpp             |  30 +--
 .../Op1dTensorGeneric.cpp                     |  69 +++---
 .../Op2dTensorGeneric.cpp                     |  23 +-
 .../{tensor => tensorOp}/Op2dTensorLite.cpp   |  42 ++--
 src/solver/tensorOp/Op2dTensorSquash.cpp      | 197 ++++++++++++++++++
 src/tensor.cpp                                |  85 ++++----
 src/tensor/problem_description.cpp            | 103 ---------
 src/tensorOp/problem_description.cpp          |  78 +++++++
 test/tensor_ops.cpp                           |  36 ++--
 13 files changed, 477 insertions(+), 301 deletions(-)
 rename src/include/miopen/{tensor => tensorOp}/invoke_params.hpp (94%)
 rename src/include/miopen/{tensor => tensorOp}/problem_description.hpp (82%)
 rename src/include/miopen/{tensor => tensorOp}/solvers.hpp (56%)
 rename src/solver/{tensor => tensorOp}/Op1dTensorGeneric.cpp (76%)
 rename src/solver/{tensor => tensorOp}/Op2dTensorGeneric.cpp (90%)
 rename src/solver/{tensor => tensorOp}/Op2dTensorLite.cpp (87%)
 create mode 100644 src/solver/tensorOp/Op2dTensorSquash.cpp
 delete mode 100644 src/tensor/problem_description.cpp
 create mode 100644 src/tensorOp/problem_description.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9dc938cae4..56b99b79cf 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -337,15 +337,16 @@ set( MIOpen_Source
     solver/softmarginloss/forward_softmarginloss.cpp
     solver/softmax/attn_softmax.cpp
     solver/softmax/softmax.cpp
-    solver/tensor/Op1dTensorGeneric.cpp
-    solver/tensor/Op2dTensorGeneric.cpp
-    solver/tensor/Op2dTensorLite.cpp
+    solver/tensorOp/Op1dTensorGeneric.cpp
+    solver/tensorOp/Op2dTensorGeneric.cpp
+    solver/tensorOp/Op2dTensorLite.cpp
+    solver/tensorOp/Op2dTensorSquash.cpp
     subbuffers.cpp
     t5layernorm_api.cpp
     target_properties.cpp
     temp_file.cpp
     tensor.cpp
-    tensor/problem_description.cpp
+    tensorOp/problem_description.cpp
     tensor_api.cpp
     transformers_adam_w_api.cpp
     seq_tensor.cpp
diff --git a/src/include/miopen/tensor/invoke_params.hpp b/src/include/miopen/tensorOp/invoke_params.hpp
similarity index 94%
rename from src/include/miopen/tensor/invoke_params.hpp
rename to src/include/miopen/tensorOp/invoke_params.hpp
index 68c96fda3f..99ff13da47 100644
--- a/src/include/miopen/tensor/invoke_params.hpp
+++ b/src/include/miopen/tensorOp/invoke_params.hpp
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -31,7 +31,7 @@
 
 namespace miopen {
 
-namespace tensor {
+namespace tensorOp {
 
 struct InvokeParams : public miopen::InvokeParams
 {
@@ -52,7 +52,7 @@ struct InvokeParams : public miopen::InvokeParams
         : alpha0(alpha0_),
           alpha1(alpha1_),
           beta(beta_),
-          tensorOp(tensorOp_),
+          tensorOperation(tensorOp_),
           aTensorDesc(aTensorDesc_),
           ATensor(ATensor_),
           bTensorDesc(bTensorDesc_),
          BTensor(BTensor_),
@@ -74,7 +74,7 @@ struct InvokeParams : public miopen::InvokeParams
     const void* alpha1;
     const void* beta;
 
-    miopenTensorOp_t tensorOp;
+    miopenTensorOp_t tensorOperation;
 
     TensorDescriptor aTensorDesc;
     ConstData_t ATensor;
@@ -92,6 +92,6 @@ struct InvokeParams : public miopen::InvokeParams
     bool nonStandardSquash;
 };
 
-} // namespace tensor
+} // namespace tensorOp
 
 } // namespace miopen
diff --git a/src/include/miopen/tensor/problem_description.hpp b/src/include/miopen/tensorOp/problem_description.hpp
similarity index 82%
rename from src/include/miopen/tensor/problem_description.hpp
rename to src/include/miopen/tensorOp/problem_description.hpp
index 41a9e4d848..81621cfcbe 100644
--- a/src/include/miopen/tensor/problem_description.hpp
+++ b/src/include/miopen/tensorOp/problem_description.hpp
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -33,13 +33,11 @@ namespace miopen {
 
 struct NetworkConfig;
 
-namespace tensor {
+namespace tensorOp {
 
 struct ProblemDescription : ProblemDescriptionBase
 {
     ProblemDescription(const miopenTensorOp_t tensorOp_,
-                       const void* alpha0_,
-                       const void* alpha1_,
                        const void* beta_,
                        const TensorDescriptor& aTensorDesc_,
                        const TensorDescriptor& bTensorDesc_,
@@ -51,7 +49,11 @@ struct ProblemDescription : ProblemDescriptionBase
           cTensorDesc(cTensorDesc_),
           nonStandardSquash(nonStandardSquash_)
     {
-        CheckAndAssignAlphaBeta(alpha0_, alpha1_, beta_);
+        if(beta_ == nullptr)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Beta value is nullptr");
+        }
+        beta = *(static_cast<const float*>(beta_));
 
         if(aTensorDesc.GetElementSize() != cTensorDesc.GetElementSize())
         {
@@ -101,9 +103,7 @@ struct ProblemDescription : ProblemDescriptionBase
 
     const miopenTensorOp_t GetTensorOp() const { return tensorOp; }
 
-    const void* GetAlpha0() const { return alpha0; }
-    const void* GetAlpha1() const { return alpha1; }
-    const void* GetBeta() const { return beta; }
+    float GetBeta() const { return beta; }
 
     const TensorDescriptor& GetATensorDesc() const { return aTensorDesc; }
     const TensorDescriptor& GetBTensorDesc() const { return bTensorDesc; }
@@ -114,31 +114,9 @@ struct ProblemDescription : ProblemDescriptionBase
     NetworkConfig MakeNetworkConfig() const override;
 
 private:
-    void CheckAndAssignAlphaBeta(const void* alpha0_, const void* alpha1_, const void* beta_)
-    {
-        if(alpha0_ == nullptr)
-        {
-            MIOPEN_THROW(miopenStatusBadParm, "Alpha0 value is nullptr");
-        }
-        if(alpha1_ == nullptr)
-        {
-            MIOPEN_THROW(miopenStatusBadParm, "Alpha1 value is nullptr");
-        }
-        if(beta_ == nullptr)
-        {
-            MIOPEN_THROW(miopenStatusBadParm, "Beta value is nullptr");
-        }
-
-        alpha0 = alpha0_;
-        alpha1 = alpha1_;
-        beta   = beta_;
-    }
-
     const miopenTensorOp_t tensorOp;
 
-    const void* alpha0;
-    const void* alpha1;
-    const void* beta;
+    float beta;
 
     const TensorDescriptor& aTensorDesc;
     const TensorDescriptor& bTensorDesc;
     const TensorDescriptor& cTensorDesc;
@@ -147,6 +125,6 @@ struct ProblemDescription : ProblemDescriptionBase
     const bool nonStandardSquash;
 };
 
-} // namespace tensor
+} // namespace tensorOp
 
 } // namespace miopen
diff --git a/src/include/miopen/tensor/solvers.hpp b/src/include/miopen/tensorOp/solvers.hpp
similarity index 56%
rename from src/include/miopen/tensor/solvers.hpp
rename to src/include/miopen/tensorOp/solvers.hpp
index 072f708c84..290a8b2cd9 100644
--- a/src/include/miopen/tensor/solvers.hpp
+++ b/src/include/miopen/tensorOp/solvers.hpp
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@
 #pragma once
 
 #include <miopen/solver.hpp>
-#include <miopen/tensor/problem_description.hpp>
+#include <miopen/tensorOp/problem_description.hpp>
 
 #include <utility>
 
 namespace miopen {
 
 namespace solver {
 
-namespace tensor {
+namespace tensorOp {
 
-using TensorOpSolver = NonTunableSolverBase<ExecutionContext, miopen::tensor::ProblemDescription>;
+using TensorOpSolver = NonTunableSolverBase<ExecutionContext, miopen::tensorOp::ProblemDescription>;
 
 struct Op1dTensorGeneric final : TensorOpSolver
 {
     const std::string& SolverDbId() const override { return GetSolverDbId<Op1dTensorGeneric>(); }
 
     bool IsApplicable(const ExecutionContext& context,
-                      const miopen::tensor::ProblemDescription& problem) const override;
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
 
     ConvSolution GetSolution(const ExecutionContext& context,
-                             const miopen::tensor::ProblemDescription& problem) const override;
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
 
-    std::size_t GetWorkspaceSize(const ExecutionContext& context,
-                                 const miopen::tensor::ProblemDescription& problem) const override;
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
 
     bool MayNeedWorkspace() const override { return false; }
 };
@@ -60,13 +61,14 @@ struct Op2dTensorGeneric final : TensorOpSolver
     const std::string& SolverDbId() const override { return GetSolverDbId<Op2dTensorGeneric>(); }
 
     bool IsApplicable(const ExecutionContext& context,
-                      const miopen::tensor::ProblemDescription& problem) const override;
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
 
     ConvSolution GetSolution(const ExecutionContext& context,
-                             const miopen::tensor::ProblemDescription& problem) const override;
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
 
-    std::size_t GetWorkspaceSize(const ExecutionContext& context,
-                                 const miopen::tensor::ProblemDescription& problem) const override;
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
 
     bool MayNeedWorkspace() const override { return false; }
 };
@@ -76,18 +78,36 @@ struct Op2dTensorLite final : TensorOpSolver
     const std::string& SolverDbId() const override { return GetSolverDbId<Op2dTensorLite>(); }
 
     bool IsApplicable(const ExecutionContext& context,
-                      const miopen::tensor::ProblemDescription& problem) const override;
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
 
     ConvSolution GetSolution(const ExecutionContext& context,
-                             const miopen::tensor::ProblemDescription& problem) const override;
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
 
-    std::size_t GetWorkspaceSize(const ExecutionContext& context,
-                                 const miopen::tensor::ProblemDescription& problem) const override;
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
 
     bool MayNeedWorkspace() const override { return false; }
 };
 
+struct Op2dTensorSquash final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op2dTensorSquash>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
-} // namespace tensor
+} // namespace tensorOp
 
 } // namespace solver
 
 } // namespace miopen
diff --git a/src/include/miopen/tensor_ops.hpp b/src/include/miopen/tensor_ops.hpp
index fa5be24048..c19eb333f2 100644
--- a/src/include/miopen/tensor_ops.hpp
+++ b/src/include/miopen/tensor_ops.hpp
@@ -189,21 +189,21 @@ MIOPEN_INTERNALS_EXPORT void OpTensor(const Handle& handle,
                                       size_t Coffset = 0,
                                       bool nonStandardSquash = false);
 
-MIOPEN_INTERNALS_EXPORT void OpTensorNew(Handle& handle,
-                                         miopenTensorOp_t tensorOp,
-                                         const void* alpha0,
-                                         const TensorDescriptor& aTensorDesc,
-                                         ConstData_t ATensor,
-                                         const void* alpha1,
-                                         const TensorDescriptor& bTensorDesc,
-                                         ConstData_t BTensor,
-                                         const void* beta,
-                                         const TensorDescriptor& cTensorDesc,
-                                         Data_t CTensor,
-                                         size_t Aoffset = 0,
-                                         size_t Boffset = 0,
-                                         size_t Coffset = 0,
-                                         bool nonStandardSquash = false);
+MIOPEN_INTERNALS_EXPORT void OpTensor2(Handle& handle,
+                                       miopenTensorOp_t tensorOp,
+                                       const void* alpha0,
+                                       const TensorDescriptor& aTensorDesc,
+                                       ConstData_t ATensor,
+                                       const void* alpha1,
+                                       const TensorDescriptor& bTensorDesc,
+                                       ConstData_t BTensor,
+                                       const void* beta,
+                                       const TensorDescriptor& cTensorDesc,
+                                       Data_t CTensor,
+                                       size_t Aoffset = 0,
+                                       size_t Boffset = 0,
+                                       size_t Coffset = 0,
+                                       bool nonStandardSquash = false);
 
 MIOPEN_INTERNALS_EXPORT void CopyTensor(const Handle& handle,
                                         const TensorDescriptor& srcDesc,
diff --git a/src/solver/tensor/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp
similarity index 76%
rename from src/solver/tensor/Op1dTensorGeneric.cpp
rename to src/solver/tensorOp/Op1dTensorGeneric.cpp
index e550ec7952..54bb19e646 100644
--- a/src/solver/tensor/Op1dTensorGeneric.cpp
+++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -24,9 +24,9 @@ * *******************************************************************************/ -#include +#include -#include +#include #include #include #include @@ -36,10 +36,10 @@ namespace miopen { namespace solver { -namespace tensor { +namespace tensorOp { bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const + const miopen::tensorOp::ProblemDescription& problem) const { auto aTensorDesc = problem.GetATensorDesc(); auto bTensorDesc = problem.GetBTensorDesc(); @@ -47,6 +47,11 @@ bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, auto blens = bTensorDesc.GetLengths(); auto asize = alens.size(); + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + if(asize == 1) { return true; @@ -66,21 +71,22 @@ bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, std::size_t Op1dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const + const miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution Op1dTensorGeneric::GetSolution(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const +ConvSolution +Op1dTensorGeneric::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto cTensorDesc = problem.GetCTensorDesc(); + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); - auto clens = cTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); size_t local_threads = 256; size_t max_num_wg = 4096; @@ -89,14 +95,12 @@ ConvSolution Op1dTensorGeneric::GetSolution(const ExecutionContext& context, num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; size_t global_threads = num_wg * local_threads; - const std::vector vld{local_threads, 1, 1}; - const std::vector vgd{global_threads, 1, 1}; + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; KernelBuildParameters build_params = KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}}; - // build_params.Define("MIOPEN_TENSOR_OP", std::to_string(problem.GetTensorOp())); - switch(problem.GetTensorOp()) { case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; @@ -118,35 +122,34 @@ ConvSolution Op1dTensorGeneric::GetSolution(const ExecutionContext& context, auto kernel = KernelInfo{}; - kernel.comp_options = build_params.GenerateFor( - kbp::HIP{}); // GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{}); + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); // + GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{}); kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; kernel.kernel_name = "Op1dTensorGeneric"; - for(uint32_t i = 0; i <= 2; i++) - { - kernel.l_wk.push_back(vld[i]); - kernel.g_wk.push_back(vgd[i]); - } + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [=](const std::vector kernels) { + result.invoker_factory = [](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); + decltype(auto) params = raw_params.CastTo(); - visit_float(bTensorDesc.GetType(), [&](auto as_float) { + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - auto blens = params.bTensorDesc.GetLengths(); - auto clens = params.cTensorDesc.GetLengths(); + const auto& blens = params.bTensorDesc.GetLengths(); + const auto& clens = params.cTensorDesc.GetLengths(); - auto astrides = params.aTensorDesc.GetStrides(); - auto bstrides = params.bTensorDesc.GetStrides(); - auto cstrides = params.cTensorDesc.GetStrides(); + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); - if(aTensorDesc.AllDimsFitIntoInt()) + if(params.aTensorDesc.AllDimsFitIntoInt()) { // change offsets to 64bit after PR is merged kernel(params.ATensor, params.BTensor, @@ -188,7 +191,7 @@ ConvSolution Op1dTensorGeneric::GetSolution(const ExecutionContext& context, return result; } -} // namespace tensor +} // namespace tensorOp } // namespace solver diff --git a/src/solver/tensor/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp similarity index 90% rename from src/solver/tensor/Op2dTensorGeneric.cpp rename to src/solver/tensorOp/Op2dTensorGeneric.cpp index 6155117ba9..640c3be115 100644 --- a/src/solver/tensor/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
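The run-time half of this refactor is the pattern just above: the invoker factory no longer captures the build-time tensor descriptors; everything is read from the InvokeParams that arrive with the call. A minimal sketch of that shape (names follow the surrounding diff; the kernel argument list is elided):

    // Sketch of the capture-free invoker used by these solvers. The empty
    // capture list is the point: descriptors and scalars come from the
    // run-time params, not baked in when the solution is built.
    result.invoker_factory = [](const std::vector<Kernel> kernels) {
        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
            decltype(auto) kernel = handle_.Run(kernels.front());
            decltype(auto) params = raw_params.CastTo<miopen::tensorOp::InvokeParams>();
            visit_float(params.bTensorDesc.GetType(), [&](auto as_float) {
                // alpha/beta scalars and the A/B/C buffers all come from params
                kernel(params.ATensor, params.BTensor, params.CTensor /* , ... */);
            });
        };
    };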
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -24,9 +24,9 @@ * *******************************************************************************/ -#include +#include -#include +#include #include #include #include @@ -36,10 +36,10 @@ namespace miopen { namespace solver { -namespace tensor { +namespace tensorOp { bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const + const miopen::tensorOp::ProblemDescription& problem) const { auto aTensorDesc = problem.GetATensorDesc(); auto bTensorDesc = problem.GetBTensorDesc(); @@ -58,13 +58,14 @@ bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, std::size_t Op2dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const + const miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution Op2dTensorGeneric::GetSolution(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const +ConvSolution +Op2dTensorGeneric::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; @@ -87,8 +88,6 @@ ConvSolution Op2dTensorGeneric::GetSolution(const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}}; - // build_params.Define("MIOPEN_TENSOR_OP", std::to_string(problem.GetTensorOp())); - switch(problem.GetTensorOp()) { case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; @@ -125,7 +124,7 @@ ConvSolution Op2dTensorGeneric::GetSolution(const ExecutionContext& context, result.invoker_factory = [=](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); + decltype(auto) params = raw_params.CastTo(); visit_float(bTensorDesc.GetType(), [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); @@ -166,7 +165,7 @@ ConvSolution Op2dTensorGeneric::GetSolution(const ExecutionContext& context, return result; } -} // namespace tensor +} // namespace tensorOp } // namespace solver diff --git a/src/solver/tensor/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp similarity index 87% rename from src/solver/tensor/Op2dTensorLite.cpp rename to src/solver/tensorOp/Op2dTensorLite.cpp index 712a32d49d..9c53a3e99e 100644 --- a/src/solver/tensor/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -24,9 +24,9 @@ * *******************************************************************************/ -#include +#include -#include +#include #include #include #include @@ -36,10 +36,10 @@ namespace miopen { namespace solver { -namespace tensor { +namespace tensorOp { bool Op2dTensorLite::IsApplicable(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const + const miopen::tensorOp::ProblemDescription& problem) const { auto aTensorDesc = problem.GetATensorDesc(); auto bTensorDesc = problem.GetBTensorDesc(); @@ -51,6 +51,11 @@ bool Op2dTensorLite::IsApplicable(const ExecutionContext& context, auto asize = alens.size(); + if(asize < 3) + { + return false; + } + size_t local_threads = 256; int max_num_wg = 4096; @@ -75,13 +80,13 @@ bool Op2dTensorLite::IsApplicable(const ExecutionContext& context, std::size_t Op2dTensorLite::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const + const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, - const miopen::tensor::ProblemDescription& problem) const + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; @@ -141,8 +146,6 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}}; - // build_params.Define("MIOPEN_TENSOR_OP", std::to_string(problem.GetTensorOp())); - switch(problem.GetTensorOp()) { case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; @@ -151,26 +154,15 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break; } - // support for 64bit still not merged - // if(aTensorDesc.AllDimsFitIntoInt()) - // { - // build_params.Define("DIM_TYPE", "uint32_t"); - // } - // else - // { - // build_params.Define("DIM_TYPE", "uint64_t"); - // } - build_params.Define("USE_2D_TENSOR_LITE"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); build_params.Define("READ_TYPE", READ_TYPE); auto kernel = KernelInfo{}; - kernel.comp_options = build_params.GenerateFor( - kbp::HIP{}); // GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{}); - kernel.kernel_file = "MIOpenTensorKernels.cl"; - kernel.kernel_name = "Op2dTensorLite"; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.kernel_name = "Op2dTensorLite"; for(uint32_t i = 0; i <= 2; i++) { @@ -181,7 +173,7 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, result.invoker_factory = [=](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); + decltype(auto) params = raw_params.CastTo(); visit_float(bTensorDesc.GetType(), [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); @@ -210,7 +202,7 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, return result; } -} // namespace tensor +} // namespace tensorOp } // namespace solver diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp 
b/src/solver/tensorOp/Op2dTensorSquash.cpp
new file mode 100644
index 0000000000..687123ed0a
--- /dev/null
+++ b/src/solver/tensorOp/Op2dTensorSquash.cpp
@@ -0,0 +1,197 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensorOp {
+
+bool Op2dTensorSquash::IsApplicable(const ExecutionContext& context,
+                                    const miopen::tensorOp::ProblemDescription& problem) const
+{
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto cTensorDesc = problem.GetCTensorDesc();
+
+    auto alens = aTensorDesc.GetLengths();
+    auto blens = bTensorDesc.GetLengths();
+    auto clens = cTensorDesc.GetLengths();
+
+    auto asize = alens.size();
+
+    if(asize < 3)
+    {
+        return false;
+    }
+
+    bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 &&
+                   (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2];
+
+    bool is_squashed = problem.GetNonStandardSquash() && !is_lite &&
+                       (blens[0] == 1 && clens[0] == 1 && clens[1] == 1 && blens[2] == clens[2]);
+
+    if(asize == 3 && is_squashed)
+    {
+        return true;
+    }
+
+    return false;
+}
+
+std::size_t
+Op2dTensorSquash::GetWorkspaceSize(const ExecutionContext& context,
+                                   const miopen::tensorOp::ProblemDescription& problem) const
+{
+    return 0;
+}
+
+ConvSolution
+Op2dTensorSquash::GetSolution(const ExecutionContext& context,
+                              const miopen::tensorOp::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto aTensorDesc = problem.GetATensorDesc();
+    auto bTensorDesc = problem.GetBTensorDesc();
+    auto cTensorDesc = problem.GetCTensorDesc();
+
+    auto alens = aTensorDesc.GetLengths();
+    auto blens = bTensorDesc.GetLengths();
+    auto clens = cTensorDesc.GetLengths();
+
+    auto astrides = aTensorDesc.GetStrides();
+    auto bstrides = bTensorDesc.GetStrides();
+    auto cstrides = cTensorDesc.GetStrides();
+
+    // first_not_one is incorrect if the b tensor's total size equals 1
+    auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; });
+    auto d             = std::distance(blens.begin(), first_not_one.base());
+
+    // quick fix: use a single workgroup when every b length is 1 (and guard a zero length)
+    int num_wg = first_not_one != blens.rend()
+                     ? static_cast<int>(*first_not_one == 0 ? 1 : *first_not_one)
+                     : 1;
+
+    for(int i = (d - 2); i >= 0; i--)
+    {
+        if(blens[i] != 1)
+        {
+            num_wg *= blens[i];
+        }
+    }
+    int max_num_wg = 4096;
+    num_wg         = num_wg > max_num_wg ? max_num_wg : num_wg;
+
+    size_t local_threads = 256;
+
+    // for naive tensor ops
+    size_t RD_BLCK              = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1;
+    const std::string data_type = GetDataType(bTensorDesc.GetType());
+    const std::string READ_TYPE = (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK);
+
+    size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1));
+    size_t grp_sz     = (total_work + local_threads - 1) / local_threads;
+
+    grp_sz        = std::min(size_t(max_num_wg), grp_sz);
+    size_t glb_sz = local_threads * grp_sz;
+
+    const std::vector<size_t> vld{local_threads, 1, 1};
+    const std::vector<size_t> vgd{glb_sz, 1, 1};
+
+    KernelBuildParameters build_params =
+        KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}};
+
+    switch(problem.GetTensorOp())
+    {
+    case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break;
+    case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break;
+    case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break;
+    case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break;
+    }
+
+    build_params.Define("USE_2D_TENSOR_SQUASH");
+    build_params.Define("RD_BLCK", std::to_string(RD_BLCK));
+    build_params.Define("READ_TYPE", READ_TYPE);
+
+    auto kernel = KernelInfo{};
+
+    kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+    kernel.kernel_file  = "MIOpenTensorKernels.cl";
+    kernel.kernel_name  = "Op2dTensorSquash";
+
+    for(uint32_t i = 0; i <= 2; i++)
+    {
+        kernel.l_wk.push_back(vld[i]);
+        kernel.g_wk.push_back(vgd[i]);
+    }
+
+    result.invoker_factory = [=](const std::vector<Kernel> kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::tensorOp::InvokeParams>();
+
+            visit_float(bTensorDesc.GetType(), [&](auto as_float) {
+                auto miopen_alpha0 = as_float(*(static_cast<const float*>(params.alpha0)));
+                auto miopen_alpha1 = as_float(*(static_cast<const float*>(params.alpha1)));
+                auto miopen_beta   = as_float(*(static_cast<const float*>(params.beta)));
+
+                kernel(params.ATensor,
+                       params.BTensor,
+                       static_cast<int>(blens[1]),
+                       static_cast<int>(bstrides[1]),
+                       params.CTensor,
+                       miopen_alpha0,
+                       miopen_alpha1,
+                       miopen_beta,
+                       static_cast<int>(params.Aoffset),
+                       static_cast<int>(params.Boffset),
+                       static_cast<int>(params.Coffset),
+                       static_cast<int>(total_work),
+                       static_cast<int>(!float_equal(miopen_alpha0, 0.0)),
+                       static_cast<int>(!float_equal(miopen_alpha1, 0.0)),
+                       static_cast<int>(!float_equal(miopen_beta, 0.0)));
+            });
+        };
+    };
+    result.construction_params.push_back(kernel);
+
+    return result;
+}
+
+} // namespace tensorOp
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/tensor.cpp b/src/tensor.cpp
index 8280201c41..6c258c0e7d 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -29,8 +29,8 @@
 #include
 #include
 #include
-#include
-#include
+#include
+#include
 #include
 #include
@@ -872,49 +872,60 @@ void from_json(const nlohmann::json& j, TensorDescriptor& descriptor)
     j.at("type").get_to(descriptor.type);
 }
 
-void OpTensorNew(Handle& handle,
-                 miopenTensorOp_t tensorOp,
-                 const void* alpha0,
-                 const TensorDescriptor& aTensorDesc,
-                 ConstData_t ATensor,
-                 const void* alpha1,
-                 const TensorDescriptor& bTensorDesc,
-                 ConstData_t BTensor,
-                 const void* beta,
-                 const TensorDescriptor& cTensorDesc,
-                 Data_t CTensor,
-                 const size_t Aoffset,
-                 const size_t Boffset,
-                 const
size_t Coffset, - bool nonStandardSquash) +void OpTensor2(Handle& handle, + miopenTensorOp_t tensorOp, + const void* alpha0, + const TensorDescriptor& aTensorDesc, + ConstData_t ATensor, + const void* alpha1, + const TensorDescriptor& bTensorDesc, + ConstData_t BTensor, + const void* beta, + const TensorDescriptor& cTensorDesc, + Data_t CTensor, + const size_t Aoffset, + const size_t Boffset, + const size_t Coffset, + bool nonStandardSquash) { if(ATensor == nullptr || BTensor == nullptr || CTensor == nullptr) { MIOPEN_THROW(miopenStatusBadParm); } - const auto problem = tensor::ProblemDescription{ - tensorOp, alpha0, alpha1, beta, aTensorDesc, bTensorDesc, cTensorDesc, nonStandardSquash}; - - const auto invoke_params = tensor::InvokeParams{tensorOp, - alpha0, - aTensorDesc, - ATensor, - alpha1, - bTensorDesc, - BTensor, - beta, - cTensorDesc, - CTensor, - Aoffset, - Boffset, - Coffset, - nonStandardSquash}; + if(alpha0 == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "Alpha0 value is nullptr"); + } + + if(alpha1 == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "Alpha1 value is nullptr"); + } + + const auto problem = tensorOp::ProblemDescription{ + tensorOp, beta, aTensorDesc, bTensorDesc, cTensorDesc, nonStandardSquash}; + + const auto invoke_params = tensorOp::InvokeParams{tensorOp, + alpha0, + aTensorDesc, + ATensor, + alpha1, + bTensorDesc, + BTensor, + beta, + cTensorDesc, + CTensor, + Aoffset, + Boffset, + Coffset, + nonStandardSquash}; const auto algo = AlgorithmName{"TensorOpSolver"}; - const auto solvers = solver::SolverContainer{} + - solver::SolverContainer{} + - solver::SolverContainer{}; + const auto solvers = solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); } diff --git a/src/tensor/problem_description.cpp b/src/tensor/problem_description.cpp deleted file mode 100644 index a3460ccff7..0000000000 --- a/src/tensor/problem_description.cpp +++ /dev/null @@ -1,103 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2023 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
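For reference, the new entry point is exercised the same way OpTensor was; a minimal call looks like this (a sketch assuming a live Handle, packed descriptors aDesc/bDesc/cDesc, and device buffers a_dev/b_dev/c_dev, mirroring the test driver below):

    // C = op(alpha0 * A, alpha1 * B) + beta * C, dispatched through the solver chain above.
    const float alpha0 = 1.0f, alpha1 = 1.0f, beta = 0.0f;
    miopen::OpTensor2(handle,
                      miopenTensorOpAdd,
                      &alpha0, aDesc, a_dev,
                      &alpha1, bDesc, b_dev,
                      &beta, cDesc, c_dev);
    // Aoffset/Boffset/Coffset default to 0, nonStandardSquash to false.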
- * - *******************************************************************************/ - -#include -#include - -namespace miopen { - -namespace tensor { - -NetworkConfig ProblemDescription::MakeNetworkConfig() const -{ - std::ostringstream ss; - const auto tensor_dim = aTensorDesc.GetLengths().size(); - - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); - - size_t local_threads = 256; - int max_num_wg = 4096; - - ss << std::to_string(bTensorDesc.GetType()) << "-" << std::to_string(aTensorDesc.GetType()) - << "-" << std::to_string(tensorOp); - - if(tensor_dim == 1) - { - int num_wg = std::clamp(clens[0] / local_threads, size_t(1), size_t(max_num_wg)); - num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; - size_t global_threads = num_wg * local_threads; - ss << "-" << std::to_string(global_threads) << "-" << std::to_string(local_threads); - - if(aTensorDesc.AllDimsFitIntoInt()) - { - ss << "-32bit"; - } - else - { - ss << "-64bit"; - } - } - else if(tensor_dim == 2) - { - local_threads = 32; - int num_wg = - std::clamp((clens[0] * clens[1]) / local_threads, size_t(1), size_t(max_num_wg)); - num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; - size_t global_threads = num_wg * local_threads; - ss << "-" << std::to_string(global_threads) << "-" << std::to_string(local_threads); - } - else if(tensor_dim == 3) - { - - size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1; - size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); - size_t grp_sz = (total_work + local_threads - 1) / local_threads; - size_t local_threads2 = 64; - size_t total_work2 = clens[1]; - size_t grp_sz2 = (total_work2 + local_threads2 - 1) / local_threads2; - grp_sz2 = std::min(size_t(max_num_wg / grp_sz), grp_sz2); - - bool lite_applicable = grp_sz <= size_t(max_num_wg); - - bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 && - (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2]; - - if(lite_applicable && is_lite) - { - ss << "-" << std::to_string(RD_BLCK) << "x" << std::to_string(local_threads) << "x" - << std::to_string(grp_sz) << std::to_string(local_threads2) - << std::to_string(grp_sz2); - } - } - - return NetworkConfig{ss.str()}; -} - -} // namespace tensor - -} // namespace miopen diff --git a/src/tensorOp/problem_description.cpp b/src/tensorOp/problem_description.cpp new file mode 100644 index 0000000000..dc16276f05 --- /dev/null +++ b/src/tensorOp/problem_description.cpp @@ -0,0 +1,78 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +namespace miopen { + +namespace tensorOp { + +NetworkConfig ProblemDescription::MakeNetworkConfig() const +{ + std::ostringstream ss; + + auto alens = aTensorDesc.GetLengths(); + auto blens = bTensorDesc.GetLengths(); + + auto astrides = aTensorDesc.GetStrides(); + auto bstrides = bTensorDesc.GetStrides(); + auto cstrides = cTensorDesc.GetStrides(); + + std::string alens_str{}; + std::string blens_str{}; + std::string astrides_str{}; + std::string bstrides_str{}; + std::string cstrides_str{}; + + for(uint32_t i = 0; i < alens.size(); i++) + { + alens_str += std::to_string(alens[i]); + blens_str += std::to_string(blens[i]); + astrides_str += std::to_string(astrides[i]); + bstrides_str += std::to_string(bstrides[i]); + cstrides_str += std::to_string(cstrides[i]); + + if(i != (alens.size() - 1)) + { + alens_str += "x"; + blens_str += "x"; + astrides_str += "x"; + bstrides_str += "x"; + cstrides_str += "x"; + } + } + + ss << std::to_string(aTensorDesc.GetType()) << "-" << std::to_string(tensorOp) << "-" + << alens_str << "-" << blens_str << "-" << astrides_str << "-" << bstrides_str << "-" + << cstrides_str << "-" << std::to_string((beta == 0)); + + return NetworkConfig{ss.str()}; +} + +} // namespace tensorOp + +} // namespace miopen diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp index b18c0276fa..c93d1b1eb1 100644 --- a/test/tensor_ops.cpp +++ b/test/tensor_ops.cpp @@ -181,24 +181,24 @@ struct verify_tensor_ops auto a_dev = handle.Write(a.data); auto b_dev = handle.Write(b.data); - miopen::OpTensorNew(handle, - // miopenTensorOpAdd, - // miopenTensorOpMax, - // miopenTensorOpMin, - miopenTensorOpMul, - &alpha0, - a.desc, - a_dev.get(), - &alpha1, - b.desc, - b_dev.get(), - &beta, - c.desc, - c_dev.get(), - Aoffset, - Boffset, - Coffset, - false); // it does not verify non-standard behaviour + miopen::OpTensor2(handle, + // miopenTensorOpAdd, + // miopenTensorOpMax, + // miopenTensorOpMin, + miopenTensorOpMul, + &alpha0, + a.desc, + a_dev.get(), + &alpha1, + b.desc, + b_dev.get(), + &beta, + c.desc, + c_dev.get(), + Aoffset, + Boffset, + Coffset, + false); // it does not verify non-standard behaviour if(not no_validate) { From ac13ff30e6b20b415d1fcb4756b5bf87ed95ecf2 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Mon, 4 Nov 2024 17:07:48 +0200 Subject: [PATCH 06/25] additional changes --- src/CMakeLists.txt | 1 + src/include/miopen/tensorOp/solvers.hpp | 17 ++ src/solver/tensorOp/Op1dTensorGeneric.cpp | 35 +--- src/solver/tensorOp/Op2dTensorGeneric.cpp | 73 ++++----- src/solver/tensorOp/Op2dTensorLite.cpp | 77 +++++---- src/solver/tensorOp/Op2dTensorSquash.cpp | 73 ++++----- src/solver/tensorOp/Op3dTensorGeneric.cpp | 186 ++++++++++++++++++++++ src/solver/tensorOp/tensor_op_helpers.hpp | 77 +++++++++ src/tensor.cpp | 3 +- test/tensor_ops.cpp | 6 +- 10 files changed, 393 insertions(+), 155 deletions(-) create mode 100644 src/solver/tensorOp/Op3dTensorGeneric.cpp create mode 100644 src/solver/tensorOp/tensor_op_helpers.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 56b99b79cf..85912d3b6f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -341,6 +341,7 @@ set( MIOpen_Source 
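(A note on the MakeNetworkConfig above: the cache key is now fully shape- and stride-qualified. As an illustration with assumed packed tensors, a = c = 20x16x8 and b = 1x16x8 produce a key of the form <type>-<op>-20x16x8-1x16x8-128x8x1-128x8x1-128x8x1-<beta==0>, where <type> and <op> are the numeric enum values and the trailing field records whether beta is zero.)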
solver/tensorOp/Op2dTensorGeneric.cpp solver/tensorOp/Op2dTensorLite.cpp solver/tensorOp/Op2dTensorSquash.cpp + solver/tensorOp/Op3dTensorGeneric.cpp subbuffers.cpp t5layernorm_api.cpp target_properties.cpp diff --git a/src/include/miopen/tensorOp/solvers.hpp b/src/include/miopen/tensorOp/solvers.hpp index 290a8b2cd9..9eb9e187b6 100644 --- a/src/include/miopen/tensorOp/solvers.hpp +++ b/src/include/miopen/tensorOp/solvers.hpp @@ -107,6 +107,23 @@ struct Op2dTensorSquash final : TensorOpSolver bool MayNeedWorkspace() const override { return false; } }; +struct Op3dTensorGeneric final : TensorOpSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } +}; + } // namespace tensorOp } // namespace solver diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index 54bb19e646..640f9968e2 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -24,8 +24,8 @@ * *******************************************************************************/ +#include "tensor_op_helpers.hpp" #include - #include #include #include @@ -41,10 +41,10 @@ namespace tensorOp { bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); + const auto& aTensorDesc = problem.GetATensorDesc(); + // const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& alens = aTensorDesc.GetLengths(); + // const auto& blens = bTensorDesc.GetLengths(); auto asize = alens.size(); if(GetDataType(aTensorDesc.GetType()) == "double") @@ -82,8 +82,6 @@ Op1dTensorGeneric::GetSolution(const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; - const auto& aTensorDesc = problem.GetATensorDesc(); - const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); const auto& clens = cTensorDesc.GetLengths(); @@ -98,32 +96,15 @@ Op1dTensorGeneric::GetSolution(const ExecutionContext& context, const std::array vld{local_threads, 1, 1}; const std::array vgd{global_threads, 1, 1}; - KernelBuildParameters build_params = - KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}}; - - switch(problem.GetTensorOp()) - { - case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; - case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break; - case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break; - case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break; - } + KernelBuildParameters build_params = KernelBuildParameters{}; - if(aTensorDesc.AllDimsFitIntoInt()) - { - build_params.Define("DIM_TYPE", "uint32_t"); - } - else - { - build_params.Define("DIM_TYPE", "uint64_t"); - } + GetCommonParams(build_params, problem, true); build_params.Define("USE_1D_TENSOR_GENERIC"); auto kernel = KernelInfo{}; - kernel.comp_options 
= build_params.GenerateFor(kbp::HIP{}); // - GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{}); + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; kernel.kernel_name = "Op1dTensorGeneric"; diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index 640c3be115..92a19d6a99 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -23,9 +23,8 @@ * SOFTWARE. * *******************************************************************************/ - +#include "tensor_op_helpers.hpp" #include - #include #include #include @@ -41,12 +40,17 @@ namespace tensorOp { bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); + const auto& aTensorDesc = problem.GetATensorDesc(); + // const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& alens = aTensorDesc.GetLengths(); + // const auto& blens = bTensorDesc.GetLengths(); auto asize = alens.size(); + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + if(asize == 2) { return true; @@ -69,11 +73,9 @@ Op2dTensorGeneric::GetSolution(const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto cTensorDesc = problem.GetCTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); - auto clens = cTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); size_t local_threads = 32; size_t max_num_wg = 4096; @@ -82,61 +84,42 @@ Op2dTensorGeneric::GetSolution(const ExecutionContext& context, num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; size_t global_threads = num_wg * local_threads; - const std::vector vld{local_threads, 1, 1}; - const std::vector vgd{global_threads, 1, 1}; + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; - KernelBuildParameters build_params = - KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}}; + KernelBuildParameters build_params = KernelBuildParameters{}; - switch(problem.GetTensorOp()) - { - case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; - case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break; - case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break; - case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break; - } - - // support for 64bit still not merged - // if(aTensorDesc.AllDimsFitIntoInt()) - // { - // build_params.Define("DIM_TYPE", "uint32_t"); - // } - // else - // { - // build_params.Define("DIM_TYPE", "uint64_t"); - // } + GetCommonParams(build_params, problem, false); build_params.Define("USE_2D_TENSOR_GENERIC"); auto kernel = KernelInfo{}; - kernel.comp_options = build_params.GenerateFor( - kbp::HIP{}); // GetDataTypeKBP(aTensorDesc.GetType()).GenerateFor(kbp::HIP{}); + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; kernel.kernel_name = "Op2dTensorGeneric"; - for(uint32_t i = 0; i <= 2; i++) - { - kernel.l_wk.push_back(vld[i]); - kernel.g_wk.push_back(vgd[i]); - } + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [=](const std::vector kernels) { + result.invoker_factory = [](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(bTensorDesc.GetType(), [&](auto as_float) { + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - auto blens = params.bTensorDesc.GetLengths(); - auto clens = params.cTensorDesc.GetLengths(); + const auto& blens = params.bTensorDesc.GetLengths(); + const auto& clens = params.cTensorDesc.GetLengths(); - auto astrides = params.aTensorDesc.GetStrides(); - auto bstrides = params.bTensorDesc.GetStrides(); - auto cstrides = params.cTensorDesc.GetStrides(); + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); kernel(params.ATensor, params.BTensor, diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index 9c53a3e99e..8782fe1b29 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -23,9 +23,9 @@ * SOFTWARE. 
* *******************************************************************************/ +#include "tensor_op_helpers.hpp" #include - #include #include #include @@ -41,16 +41,21 @@ namespace tensorOp { bool Op2dTensorLite::IsApplicable(const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto cTensorDesc = problem.GetCTensorDesc(); + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); auto asize = alens.size(); + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + if(asize < 3) { return false; @@ -90,17 +95,11 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto cTensorDesc = problem.GetCTensorDesc(); - - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); - auto astrides = aTensorDesc.GetStrides(); - auto bstrides = bTensorDesc.GetStrides(); - auto cstrides = cTensorDesc.GetStrides(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); // first_not_one is incorrect if btensor size equal to 1 auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); @@ -124,9 +123,9 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, size_t local_threads = 256; // for naive tensor ops - size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1; - const std::string data_type = GetDataType(bTensorDesc.GetType()); - const std::string READ_TYPE = (RD_BLCK == 1) ? 
data_type : data_type + std::to_string(RD_BLCK); + size_t RD_BLCK = size_t(1); + std::string READ_TYPE = ""; + GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType(), RD_BLCK, READ_TYPE); size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); size_t grp_sz = (total_work + local_threads - 1) / local_threads; @@ -140,19 +139,12 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, grp_sz2 = std::min(size_t(max_num_wg / grp_sz), grp_sz2); size_t glb_sz2 = local_threads2 * grp_sz2; - const std::vector vld{local_threads, 1, 1}; - const std::vector vgd{glb_sz, glb_sz2, 1}; + const std::array vld{local_threads, 1, 1}; + const std::array vgd{glb_sz, glb_sz2, 1}; - KernelBuildParameters build_params = - KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}}; + KernelBuildParameters build_params = KernelBuildParameters{}; - switch(problem.GetTensorOp()) - { - case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; - case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break; - case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break; - case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break; - } + GetCommonParams(build_params, problem, false); build_params.Define("USE_2D_TENSOR_LITE"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); @@ -160,26 +152,31 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, auto kernel = KernelInfo{}; - kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); kernel.kernel_file = "MIOpenTensorKernels.cl"; kernel.kernel_name = "Op2dTensorLite"; - for(uint32_t i = 0; i <= 2; i++) - { - kernel.l_wk.push_back(vld[i]); - kernel.g_wk.push_back(vgd[i]); - } + using std::begin, std::end; - result.invoker_factory = [=](const std::vector kernels) { + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = [total_work, total_work2](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(bTensorDesc.GetType(), [&](auto as_float) { + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); + const auto& blens = params.bTensorDesc.GetLengths(); + + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); + kernel(params.ATensor, static_cast(astrides[1]), params.BTensor, @@ -192,6 +189,8 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, static_cast(params.Aoffset), static_cast(params.Boffset), static_cast(params.Coffset), + static_cast(total_work), + static_cast(total_work2), static_cast(!float_equal(miopen_beta, 0.0)), static_cast(blens[1] == 1)); }); diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index 687123ed0a..0368592cf7 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -23,9 +23,8 @@ * SOFTWARE. 
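The vector-width helper introduced here is easy to sanity-check with a worked example (illustrative numbers, not patch code): for clens[2] == 8 on a float tensor, GetRDBLCKandREADTYPE picks RD_BLCK = 4 and READ_TYPE = "float4", so each work-item moves a float4 and total_work above becomes std::max(8 / 4, size_t(1)) == 2; an odd innermost length such as 7 degrades to RD_BLCK = 1 and plain "float".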
* *******************************************************************************/ - +#include "tensor_op_helpers.hpp" #include - #include #include #include @@ -41,16 +40,21 @@ namespace tensorOp { bool Op2dTensorSquash::IsApplicable(const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto cTensorDesc = problem.GetCTensorDesc(); + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); auto asize = alens.size(); + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + if(asize < 3) { return false; @@ -83,17 +87,11 @@ Op2dTensorSquash::GetSolution(const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; - auto aTensorDesc = problem.GetATensorDesc(); - auto bTensorDesc = problem.GetBTensorDesc(); - auto cTensorDesc = problem.GetCTensorDesc(); - - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); - auto astrides = aTensorDesc.GetStrides(); - auto bstrides = bTensorDesc.GetStrides(); - auto cstrides = cTensorDesc.GetStrides(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); // first_not_one is incorrect if btensor size equal to 1 auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); @@ -117,9 +115,9 @@ Op2dTensorSquash::GetSolution(const ExecutionContext& context, size_t local_threads = 256; // for naive tensor ops - size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1; - const std::string data_type = GetDataType(bTensorDesc.GetType()); - const std::string READ_TYPE = (RD_BLCK == 1) ? 
data_type : data_type + std::to_string(RD_BLCK); + size_t RD_BLCK = size_t(1); + std::string READ_TYPE = ""; + GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType(), RD_BLCK, READ_TYPE); size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); size_t grp_sz = (total_work + local_threads - 1) / local_threads; @@ -127,19 +125,12 @@ Op2dTensorSquash::GetSolution(const ExecutionContext& context, grp_sz = std::min(size_t(max_num_wg), grp_sz); size_t glb_sz = local_threads * grp_sz; - const std::vector vld{local_threads, 1, 1}; - const std::vector vgd{glb_sz, 1, 1}; + const std::array vld{local_threads, 1, 1}; + const std::array vgd{glb_sz, 1, 1}; - KernelBuildParameters build_params = - KernelBuildParameters{{"MIOPEN_TYPE", GetDataType(bTensorDesc.GetType())}}; + KernelBuildParameters build_params = KernelBuildParameters{}; - switch(problem.GetTensorOp()) - { - case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; - case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break; - case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break; - case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break; - } + GetCommonParams(build_params, problem, false); build_params.Define("USE_2D_TENSOR_SQUASH"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); @@ -147,26 +138,28 @@ Op2dTensorSquash::GetSolution(const ExecutionContext& context, auto kernel = KernelInfo{}; - kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); kernel.kernel_file = "MIOpenTensorKernels.cl"; kernel.kernel_name = "Op2dTensorSquash"; - for(uint32_t i = 0; i <= 2; i++) - { - kernel.l_wk.push_back(vld[i]); - kernel.g_wk.push_back(vgd[i]); - } + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [=](const std::vector kernels) { + result.invoker_factory = [total_work](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(bTensorDesc.GetType(), [&](auto as_float) { + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); + const auto& blens = params.bTensorDesc.GetLengths(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + kernel(params.ATensor, params.BTensor, static_cast(blens[1]), diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp new file mode 100644 index 0000000000..e2734ddd05 --- /dev/null +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -0,0 +1,186 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool Op3dTensorGeneric::IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& alens = aTensorDesc.GetLengths(); + auto asize = alens.size(); + + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + + if(asize == 3) + { + return true; + } + + return false; +} + +std::size_t +Op3dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +Op3dTensorGeneric::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + // first_not_one is incorrect if btensor size equal to 1 + auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); + auto d = std::distance(blens.begin(), first_not_one.base()); + + // quick fix + int num_wg = first_not_one != blens.rend() + ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) + : 1; + + int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + + unsigned int bitmap = 0; + // update bitmap for first_not_one + bitmap |= (1 << (blens.size() - d)); + + for(int i = (d - 2); i >= 0; i--) + { + if(blens[i] != 1) + { + bitmap |= (1 << (blens.size() - (i + 1))); + num_wg *= blens[i]; + } + else + { + work_per_wg *= clens[i]; + } + } + + int num_wg_orig = num_wg; + int max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? 
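+    /* A worked example of the bitmap/work split above (illustrative values,
+     * not patch code): for blens = {1, 16, 1} against clens = {20, 16, 8},
+     * first_not_one stops at blens[1] == 16, so d == 2 and num_wg starts at 16;
+     * bit (blens.size() - d) == 1 is set, giving bitmap == 0b010 (dim 1 present
+     * in 'b'); blens[0] == 1, so work_per_wg grows from clens[2] == 8 to
+     * 8 * clens[0] == 160. Each of the 16 workgroups then walks 160 'c'
+     * elements, and the clamp below caps num_wg at max_num_wg. */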
max_num_wg : num_wg; + + size_t local_threads = 256; + size_t global_threads = num_wg * local_threads; + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("USE_3D_TENSOR_GENERIC"); + build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.kernel_name = "Op3dTensorGeneric"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = [bitmap, work_per_wg, num_wg_orig](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + const auto& blens = params.bTensorDesc.GetLengths(); + const auto& clens = params.cTensorDesc.GetLengths(); + + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); + + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + params.BTensor, + static_cast(blens[1]), + static_cast(blens[2]), + static_cast(bstrides[0]), + static_cast(bstrides[1]), + params.CTensor, + static_cast(clens[1]), + static_cast(clens[2]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + bitmap, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig)); + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp new file mode 100644 index 0000000000..a9446472bf --- /dev/null +++ b/src/solver/tensorOp/tensor_op_helpers.hpp @@ -0,0 +1,77 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include
+#include
+#include
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensorOp {
+
+inline void GetCommonParams(KernelBuildParameters& build_params,
+                            miopen::tensorOp::ProblemDescription problem,
+                            bool is64bSupported)
+{
+    build_params.Define("MIOPEN_TYPE", miopen::GetDataType(problem.GetBTensorDesc().GetType()));
+
+    switch(problem.GetTensorOp())
+    {
+    case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break;
+    case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break;
+    case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break;
+    case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break;
+    }
+
+    if(is64bSupported && problem.GetATensorDesc().AllDimsFitIntoInt())
+    {
+        build_params.Define("DIM_TYPE", "uint32_t");
+    }
+    else
+    {
+        build_params.Define("DIM_TYPE", "uint64_t");
+    }
+    // current workaround
+    build_params.Define("MIOPEN_USE_FP16", std::to_string(0));
+    build_params.Define("MIOPEN_USE_FP32", std::to_string(1));
+}
+
+inline void
+GetRDBLCKandREADTYPE(size_t len, miopenDataType_t type, size_t& RD_BLCK, std::string& READ_TYPE)
+{
+    RD_BLCK                     = (len % 4 == 0) ? 4 : (len % 2 == 0) ? 2 : 1;
+    const std::string data_type = GetDataType(type);
+    READ_TYPE = (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK);
+}
+
+} // namespace tensorOp
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/tensor.cpp b/src/tensor.cpp
index 6c258c0e7d..20973d58be 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -925,7 +925,8 @@ void OpTensor2(Handle& handle,
     const auto solvers = solver::SolverContainer{} +
                          solver::SolverContainer{} +
                          solver::SolverContainer{} +
-                         solver::SolverContainer{};
+                         solver::SolverContainer{} +
+                         solver::SolverContainer{};
 
     solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
 }
diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp
index c93d1b1eb1..5d33229cef 100644
--- a/test/tensor_ops.cpp
+++ b/test/tensor_ops.cpp
@@ -241,7 +241,7 @@ struct tensor_ops_driver : test_driver
 
     std::vector> get_sub_tensor_a()
     {
-        return {/*{32, 16, 8, 4, 4}, {16, 20, 16, 8}, {20, 16, 8}, {1, 16, 8},*/ {16, 8}, {8}};
+        return {/*{32, 16, 8, 4, 4}, {16, 20, 16, 8}, */ {20, 16, 8}, {1, 16, 8}, {16, 8}, {8}};
     }
 
     std::vector> get_sub_tensor_b()
@@ -258,12 +258,12 @@ struct tensor_ops_driver : test_driver
         {1, 20, 16, 1},
         {1, 20, 1, 1},
         {1, 1, 16, 8},
-        {1, 1, 1, 8},
+        {1, 1, 1, 8}, */
         {20, 16, 8},
         {20, 16, 1},
         {1, 16, 8},
         {1, 16, 1},
-        {20, 1, 1},*/
+        {20, 1, 1},
         {16, 8},
         {16, 1},
         {1, 8},

From cadb2649c1f29f8d916b365213421a3733c6f543 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Tue, 5 Nov 2024 19:10:16 +0200
Subject: [PATCH 07/25] initial switch to solver structure for all kernels,
 still need to separate some of them into unique solvers and tidy the code

---
 src/CMakeLists.txt                        |   5 +
 src/include/miopen/tensorOp/solvers.hpp   |  85 +++++++
 src/kernels/MIOpenTensorKernels.cl        |  18 --
 src/solver/tensorOp/Op2dTensorLite.cpp    |  18 +-
 src/solver/tensorOp/Op2dTensorSquash.cpp  |  18 +-
 src/solver/tensorOp/Op3dTensorGeneric.cpp |  28 +--
 src/solver/tensorOp/Op4dTensorGeneric.cpp | 161 ++++++++++++++
 src/solver/tensorOp/Op4dTensorLite.cpp    | 175
+++++++++++++++ src/solver/tensorOp/Op5dTensorGeneric.cpp | 174 +++++++++++++++ src/solver/tensorOp/OpTensorFwdBias.cpp | 214 ++++++++++++++++++ src/solver/tensorOp/OpTensorLeadingOnes.cpp | 232 ++++++++++++++++++++ src/solver/tensorOp/tensor_op_helpers.hpp | 146 +++++++++++- src/tensor.cpp | 11 +- test/tensor_ops.cpp | 6 +- 14 files changed, 1211 insertions(+), 80 deletions(-) create mode 100644 src/solver/tensorOp/Op4dTensorGeneric.cpp create mode 100644 src/solver/tensorOp/Op4dTensorLite.cpp create mode 100644 src/solver/tensorOp/Op5dTensorGeneric.cpp create mode 100644 src/solver/tensorOp/OpTensorFwdBias.cpp create mode 100644 src/solver/tensorOp/OpTensorLeadingOnes.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 85912d3b6f..4f1096001a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -342,6 +342,11 @@ set( MIOpen_Source solver/tensorOp/Op2dTensorLite.cpp solver/tensorOp/Op2dTensorSquash.cpp solver/tensorOp/Op3dTensorGeneric.cpp + solver/tensorOp/OpTensorFwdBias.cpp + solver/tensorOp/Op4dTensorLite.cpp + solver/tensorOp/OpTensorLeadingOnes.cpp + solver/tensorOp/Op4dTensorGeneric.cpp + solver/tensorOp/Op5dTensorGeneric.cpp subbuffers.cpp t5layernorm_api.cpp target_properties.cpp diff --git a/src/include/miopen/tensorOp/solvers.hpp b/src/include/miopen/tensorOp/solvers.hpp index 9eb9e187b6..635d0ab777 100644 --- a/src/include/miopen/tensorOp/solvers.hpp +++ b/src/include/miopen/tensorOp/solvers.hpp @@ -124,6 +124,91 @@ struct Op3dTensorGeneric final : TensorOpSolver bool MayNeedWorkspace() const override { return false; } }; +struct OpTensorFwdBias final : TensorOpSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } +}; + +struct Op4dTensorLite final : TensorOpSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } +}; + +struct OpTensorLeadingOnes final : TensorOpSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } +}; + +struct Op4dTensorGeneric final : TensorOpSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const 
miopen::tensorOp::ProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } +}; + +struct Op5dTensorGeneric final : TensorOpSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } +}; + } // namespace tensorOp } // namespace solver diff --git a/src/kernels/MIOpenTensorKernels.cl b/src/kernels/MIOpenTensorKernels.cl index da998696ac..3c21267e57 100644 --- a/src/kernels/MIOpenTensorKernels.cl +++ b/src/kernels/MIOpenTensorKernels.cl @@ -24,24 +24,6 @@ * *******************************************************************************/ -#if MIOPEN_USE_FP16 == 1 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#define _FLOAT half -#ifndef HALF_MAX -#define MAX_VAL 65504 /* max value */ -#else -#define MAX_VAL HALF_MAX -#endif -#endif -#if MIOPEN_USE_FP32 == 1 -#define _FLOAT float -#ifndef FLT_MAX -#define MAX_VAL 3.402823466e+38F /* max value */ -#else -#define MAX_VAL FLT_MAX -#endif -#endif - /* Only works for NCHW * bitmap tracks which dims are the same between 'a' and 'c'. * Example: 0, 1, 1, 0 means that C and H dims are the same and the rest are ones diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index 8782fe1b29..6dedb553e5 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -101,22 +101,12 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); - // first_not_one is incorrect if btensor size equal to 1 - auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); - auto d = std::distance(blens.begin(), first_not_one.base()); + int num_wg = 0; + int work_per_wg = 0; + unsigned int bitmap = 0; - // quick fix - int num_wg = first_not_one != blens.rend() - ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) - : 1; + GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); - for(int i = (d - 2); i >= 0; i--) - { - if(blens[i] != 1) - { - num_wg *= blens[i]; - } - } int max_num_wg = 4096; num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index 0368592cf7..2cb95d1088 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -93,22 +93,12 @@ Op2dTensorSquash::GetSolution(const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); - // first_not_one is incorrect if btensor size equal to 1 - auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); - auto d = std::distance(blens.begin(), first_not_one.base()); + int num_wg = 0; + int work_per_wg = 0; + unsigned int bitmap = 0; - // quick fix - int num_wg = first_not_one != blens.rend() - ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) - : 1; + GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); - for(int i = (d - 2); i >= 0; i--) - { - if(blens[i] != 1) - { - num_wg *= blens[i]; - } - } int max_num_wg = 4096; num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index e2734ddd05..252c8f937b 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -76,33 +76,11 @@ Op3dTensorGeneric::GetSolution(const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); - // first_not_one is incorrect if btensor size equal to 1 - auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); - auto d = std::distance(blens.begin(), first_not_one.base()); - - // quick fix - int num_wg = first_not_one != blens.rend() - ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) - : 1; - - int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); - + int num_wg = 0; + int work_per_wg = 0; unsigned int bitmap = 0; - // update bitmap for first_not_one - bitmap |= (1 << (blens.size() - d)); - for(int i = (d - 2); i >= 0; i--) - { - if(blens[i] != 1) - { - bitmap |= (1 << (blens.size() - (i + 1))); - num_wg *= blens[i]; - } - else - { - work_per_wg *= clens[i]; - } - } + GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); int num_wg_orig = num_wg; int max_num_wg = 4096; diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp new file mode 100644 index 0000000000..fb50c37a69 --- /dev/null +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -0,0 +1,161 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool Op4dTensorGeneric::IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& alens = aTensorDesc.GetLengths(); + auto asize = alens.size(); + + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + + if(asize == 4) + { + return true; + } + + return false; +} + +std::size_t +Op4dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +Op4dTensorGeneric::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + int max_num_wg = 4096; + int num_wg_orig = 0; + int work_per_wg = 0; + int incr_wg = 0; + unsigned int bitmap = 0; + + size_t local_threads = 0; + size_t global_threads = 0; + + Get4dParams( + problem, false, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("USE_4D_TENSOR_GENERIC"); + build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.kernel_name = "Op4dTensorGeneric"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = [work_per_wg, num_wg_orig, bitmap](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + const auto& blens = params.bTensorDesc.GetLengths(); + const auto& clens = params.cTensorDesc.GetLengths(); + + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); + + kernel(params.ATensor, + static_cast(astrides[0]), // a_nstride, + static_cast(astrides[1]), // a_cstride, + static_cast(astrides[2]), // a_hstride, + params.BTensor, + static_cast(blens[1]), // b_c, + static_cast(blens[2]), // b_h, + static_cast(blens[3]), // b_w, 
+ static_cast(bstrides[0]), // b_nstride, + static_cast(bstrides[1]), // b_cstride, + static_cast(bstrides[2]), // b_hstride, + params.CTensor, + static_cast(clens[1]), // c_c, + static_cast(clens[2]), // c_h, + static_cast(clens[3]), // c_w, + static_cast(cstrides[0]), // c_nstride, + static_cast(cstrides[1]), // c_cstride, + static_cast(cstrides[2]), // c_hstride, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + bitmap, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig)); + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp new file mode 100644 index 0000000000..d0431cb1f9 --- /dev/null +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -0,0 +1,175 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool Op4dTensorLite::IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + auto asize = alens.size(); + + int num_wg = 0; + int work_per_wg = 0; + unsigned int bitmap = 0; + + GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; + + auto fwd_conv_bias = bitmap == (1 << 2) ? 
1 : 0; + + bool packed_tensor = true; + + // auto alens = aTensorDesc.GetLengths(); + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + packed_tensor &= cTensorDesc.IsPacked(); + + bool packed_equal_tensor = + packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); + + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + + if(asize == 4 && fwd_conv_bias == 0 && packed_equal_tensor) + { + return true; + } + + return false; +} + +std::size_t +Op4dTensorLite::GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution Op4dTensorLite::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + int num_wg_orig = 0; + int work_per_wg = 0; + int incr_wg = 0; + unsigned int bitmap = 0; + + size_t local_threads = 0; + size_t global_threads = 0; + + Get4dParams( + problem, true, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + + size_t RD_BLCK = size_t(1); + std::string READ_TYPE = ""; + GetRDBLCKandREADTYPE(cTensorDesc.GetElementSize(), bTensorDesc.GetType(), RD_BLCK, READ_TYPE); + + size_t total_work = std::max(cTensorDesc.GetElementSize() / RD_BLCK, size_t(1)); + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("USE_4D_TENSOR_LITE"); + build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); + build_params.Define("READ_TYPE", READ_TYPE); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.kernel_name = "Op4dTensorLite"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = [total_work](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + params.BTensor, + params.CTensor, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(total_work), + static_cast(!float_equal(miopen_beta, 0.0))); + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp new file mode 100644 index 0000000000..1756111326 --- /dev/null +++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp @@ -0,0 +1,174 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool Op5dTensorGeneric::IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& alens = aTensorDesc.GetLengths(); + auto asize = alens.size(); + + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + + if(asize == 5) + { + return true; + } + return false; +} + +std::size_t +Op5dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +Op5dTensorGeneric::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& cTensorDesc = problem.GetCTensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + int num_wg = 0; + int work_per_wg = 0; + unsigned int bitmap = 0; + + GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + + int num_wg_orig = num_wg; + int max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; + + size_t local_threads = 256; + size_t global_threads = num_wg * local_threads; + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("USE_5D_TENSOR_GENERIC"); + build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.kernel_name = "Op5dTensorGeneric"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = [bitmap, work_per_wg, num_wg_orig](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + const auto& blens = params.bTensorDesc.GetLengths(); + const auto& clens = params.cTensorDesc.GetLengths(); + + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); + + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(astrides[2]), + static_cast(astrides[3]), + params.BTensor, + static_cast(blens[1]), // b_c, + static_cast(blens[2]), // b_d, + static_cast(blens[3]), // b_h, + static_cast(blens[4]), // b_w, + static_cast(bstrides[0]), // b_nstride, + static_cast(bstrides[1]), // b_cstride, + static_cast(bstrides[2]), // b_dstride, + static_cast(bstrides[3]), // b_hstride, + params.CTensor, + static_cast(clens[1]), // c_c, + static_cast(clens[2]), // c_d, + static_cast(clens[3]), // c_h, + static_cast(clens[4]), // c_w, + static_cast(cstrides[0]), // c_nstride, + static_cast(cstrides[1]), // c_cstride, + static_cast(cstrides[2]), // c_dstride, + static_cast(cstrides[3]), // c_hstride, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + bitmap, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig)); + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp new file mode 100644 index 0000000000..4f304d9fdb --- /dev/null +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -0,0 +1,214 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool OpTensorFwdBias::IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + auto asize = alens.size(); + + int num_wg = 0; + int work_per_wg = 0; + unsigned int bitmap = 0; + + GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; + + auto fwd_conv_bias = bitmap == (1 << 2) ? 
1 : 0; + + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + + if(asize == 4 && fwd_conv_bias != 0) + { + return true; + } + + return false; +} + +std::size_t +OpTensorFwdBias::GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + int max_num_wg = 4096; + int num_wg_orig = 0; + int work_per_wg = 0; + int incr_wg = 0; + unsigned int bitmap = 0; + + size_t local_threads = 0; + size_t global_threads = 0; + + Get4dParams( + problem, false, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + bool packed_tensor = true; + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + packed_tensor &= cTensorDesc.IsPacked(); + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + if(packed_tensor) + { + build_params.Define("USE_FWD_BIAS"); + kernel.kernel_name = "OpTensorFwdBias"; + } + else + { + build_params.Define("USE_FWD_BIAS_GENERIC"); + kernel.kernel_name = "OpTensorFwdBiasGeneric"; + } + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = + [work_per_wg, num_wg_orig, incr_wg, packed_tensor](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + const auto& blens = params.bTensorDesc.GetLengths(); + const auto& clens = params.cTensorDesc.GetLengths(); + + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); + + if(packed_tensor) + { + kernel(params.ATensor, + params.BTensor, + static_cast(blens[1]), + params.CTensor, + static_cast(clens[0]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + work_per_wg, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + static_cast(incr_wg)); + } + else + { + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(astrides[2]), + params.BTensor, + static_cast(blens[1]), + static_cast(bstrides[1]), + params.CTensor, + static_cast(clens[0]), + static_cast(clens[3]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + static_cast(cstrides[2]), + miopen_alpha0, + miopen_alpha1, 
+ miopen_beta, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + static_cast(incr_wg)); + } + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp new file mode 100644 index 0000000000..77b67e9a76 --- /dev/null +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -0,0 +1,232 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool OpTensorLeadingOnes::IsApplicable(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + auto asize = alens.size(); + + int num_wg = 0; + int work_per_wg = 0; + unsigned int bitmap = 0; + + GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; + + auto fwd_conv_bias = bitmap == (1 << 2) ? 
1 : 0; + + bool packed_tensor = true; + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + packed_tensor &= cTensorDesc.IsPacked(); + + bool packed_equal_tensor = + packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); + + bool leading_ones = true; + // first_not_one is incorrect if btensor size equal to 1 + auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); + auto d = std::distance(blens.begin(), first_not_one.base()); + + IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2), leading_ones); + + if(GetDataType(aTensorDesc.GetType()) == "double") + { + return false; + } + + if(asize == 4 && fwd_conv_bias == 0 && !packed_equal_tensor && leading_ones) + { + return true; + } + + return false; +} + +std::size_t +OpTensorLeadingOnes::GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + int max_num_wg = 4096; + int num_wg_orig = 0; + int work_per_wg = 0; + int incr_wg = 0; + unsigned int bitmap = 0; + + size_t local_threads = 0; + size_t global_threads = 0; + + Get4dParams( + problem, false, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + bool packed_tensor = true; + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + packed_tensor &= cTensorDesc.IsPacked(); + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + if(packed_tensor) + { + build_params.Define("USE_LEADING_ONES"); + kernel.kernel_name = "OpTensorLeadingOnes"; + } + else + { + build_params.Define("USE_LEADING_ONES_GENERIC"); + kernel.kernel_name = "OpTensorLeadingOnesGeneric"; + } + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = + [work_per_wg, num_wg_orig, bitmap, packed_tensor](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + const auto& clens = params.cTensorDesc.GetLengths(); + + const auto& astrides = params.aTensorDesc.GetStrides(); + const auto& bstrides = params.bTensorDesc.GetStrides(); + const auto& cstrides = params.cTensorDesc.GetStrides(); + + if(packed_tensor) + { + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(clens[1]), + static_cast(clens[2]), + static_cast(clens[3]), + 
static_cast(cstrides[0]), + static_cast(cstrides[1]), + work_per_wg, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + bitmap); + } + else + { + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(astrides[2]), + params.BTensor, + static_cast(bstrides[0]), + static_cast(bstrides[1]), + static_cast(bstrides[2]), + params.CTensor, + static_cast(clens[1]), + static_cast(clens[2]), + static_cast(clens[3]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + static_cast(cstrides[2]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + bitmap); + } + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp index a9446472bf..02b1a137d0 100644 --- a/src/solver/tensorOp/tensor_op_helpers.hpp +++ b/src/solver/tensorOp/tensor_op_helpers.hpp @@ -57,9 +57,6 @@ inline void GetCommonParams(KernelBuildParameters& build_params, { build_params.Define("DIM_TYPE", "uint64_t"); } - // current workaround - build_params.Define("MIOPEN_USE_FP16", std::to_string(0)); - build_params.Define("MIOPEN_USE_FP32", std::to_string(1)); } inline void @@ -70,6 +67,149 @@ GetRDBLCKandREADTYPE(size_t len, miopenDataType_t type, size_t& RD_BLCK, std::st READ_TYPE = (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK); } +inline void GetBitmapAndWgInfo(const std::vector& blens, + const std::vector& clens, + int& num_wg, + int& work_per_wg, + unsigned int& bitmap) +{ + // first_not_one is incorrect if btensor size equal to 1 + auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); + auto d = std::distance(blens.begin(), first_not_one.base()); + + // quick fix + num_wg = first_not_one != blens.rend() + ? static_cast(*first_not_one == 0 ? 
1 : *first_not_one) + : 1; + + work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + + // update bitmap for first_not_one + bitmap |= (1 << (blens.size() - d)); + + for(int i = (d - 2); i >= 0; i--) + { + if(blens[i] != 1) + { + bitmap |= (1 << (blens.size() - (i + 1))); + num_wg *= blens[i]; + } + else + { + work_per_wg *= clens[i]; + } + } +} + +inline void +IsBitmapLeadingOnes(unsigned int bitmap, int n_size, int first_not_one, bool& leading_ones) +{ + for(int i = first_not_one; i >= 0; i--) + { + bool is_one = (bitmap & (1 << (n_size - 1 - i))) != 0u; + leading_ones &= is_one; + } +} + +inline void Get4dParams(const miopen::tensorOp::ProblemDescription& problem, + bool is4dLite, + int& num_wg_orig, + int& work_per_wg, + int& incr_wg, + unsigned int& bitmap, + size_t& local_threads, + size_t& global_threads) +{ + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + auto dims = clens.size(); + + // first_not_one is incorrect if btensor size equal to 1 + auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); + auto d = std::distance(blens.begin(), first_not_one.base()); + + // quick fix + int num_wg = first_not_one != blens.rend() + ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) + : 1; + + work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + + // update bitmap for first_not_one + bitmap |= (1 << (blens.size() - d)); + + for(int i = (d - 2); i >= 0; i--) + { + if(blens[i] != 1) + { + bitmap |= (1 << (blens.size() - (i + 1))); + num_wg *= blens[i]; + } + else + { + work_per_wg *= clens[i]; + } + } + + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; + + // Forward Convolution Bias specialization + // for fwd-bias, bitmap looks like <0, 1, 0, 0> + // Is the no. of work-groups and the work for each wg balanced? + auto fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; + // This block gives off indexing for 5d tensors, skipping + if(fwd_conv_bias == 1 && dims < 5 && num_wg < 640 && work_per_wg > 256 && clens[0] > 0) + { // 640 workgroups of size 256 needed to completely fill the GPU + + work_per_wg /= clens[0]; // c_n; + num_wg *= clens[0]; // c_n; + incr_wg = 1; + } + + num_wg_orig = num_wg; + int max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; + + local_threads = 256; + + bool leading_ones = true; + IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2), leading_ones); + + if(leading_ones && work_per_wg < 64) + { + local_threads = 64; + } + + // Special case for adding tensors in place + global_threads = + (static_cast(leading_ones) == 1 && (d - 1) == 3) ? num_wg : num_wg * local_threads; + global_threads = (global_threads < local_threads) ? local_threads : global_threads; + + if(is4dLite) + { + // for naive tensor ops + const std::string data_type = GetDataType(bTensorDesc.GetType()); + + size_t TENS_LEN = cTensorDesc.GetElementSize(); + size_t RD_BLCK = (TENS_LEN % 4 == 0) ? 4 : (TENS_LEN % 2 == 0) ? 2 : 1; + const std::string READ_TYPE = + (RD_BLCK == 1) ? 
data_type : data_type + std::to_string(RD_BLCK); + + size_t total_work = std::max(TENS_LEN / RD_BLCK, size_t(1)); + size_t grp_sz = (total_work + local_threads - 1) / local_threads; + grp_sz = std::min(size_t(max_num_wg), grp_sz); + size_t glb_sz = local_threads * grp_sz; + + global_threads = glb_sz; + } +} + } // namespace tensorOp } // namespace solver diff --git a/src/tensor.cpp b/src/tensor.cpp index 20973d58be..f65a1a408e 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -922,11 +922,16 @@ void OpTensor2(Handle& handle, nonStandardSquash}; const auto algo = AlgorithmName{"TensorOpSolver"}; - const auto solvers = solver::SolverContainer{} + + const auto solvers = solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + solver::SolverContainer{} + - solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + solver::SolverContainer{} + - solver::SolverContainer{}; + solver::SolverContainer{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); } diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp index 5d33229cef..ee42a94ea6 100644 --- a/test/tensor_ops.cpp +++ b/test/tensor_ops.cpp @@ -241,12 +241,12 @@ struct tensor_ops_driver : test_driver std::vector> get_sub_tensor_a() { - return {/*{32, 16, 8, 4, 4}, {16, 20, 16, 8}, */ {20, 16, 8}, {1, 16, 8}, {16, 8}, {8}}; + return {{32, 16, 8, 4, 4}, {16, 20, 16, 8}, {20, 16, 8}, {1, 16, 8}, {16, 8}, {8}}; } std::vector> get_sub_tensor_b() { - return {/*{32, 16, 8, 4, 4}, + return {{32, 16, 8, 4, 4}, {32, 16, 1, 1, 1}, {1, 16, 8, 1, 1}, {1, 1, 8, 4, 1}, @@ -258,7 +258,7 @@ struct tensor_ops_driver : test_driver {1, 20, 16, 1}, {1, 20, 1, 1}, {1, 1, 16, 8}, - {1, 1, 1, 8}, */ + {1, 1, 1, 8}, {20, 16, 8}, {20, 16, 1}, {1, 16, 8}, From 63603f02fbd03243b088f04d8de261cd40a61a75 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Wed, 6 Nov 2024 14:03:58 +0200 Subject: [PATCH 08/25] fix for two kernels in one solver --- src/solver/tensorOp/OpTensorFwdBias.cpp | 5 +++-- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index 4f304d9fdb..c2aa07c8ae 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -119,8 +119,6 @@ ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, auto kernel = KernelInfo{}; - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - kernel.kernel_file = "MIOpenTensorKernels.cl"; if(packed_tensor) { build_params.Define("USE_FWD_BIAS"); @@ -132,6 +130,9 @@ ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, kernel.kernel_name = "OpTensorFwdBiasGeneric"; } + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + using std::begin, std::end; kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index 77b67e9a76..8f4a4399ce 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -132,11 +132,8 @@ OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, GetCommonParams(build_params, problem, false); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); - auto kernel = KernelInfo{}; - kernel.comp_options = 
build_params.GenerateFor(kbp::OpenCL{}); - kernel.kernel_file = "MIOpenTensorKernels.cl"; if(packed_tensor) { build_params.Define("USE_LEADING_ONES"); @@ -148,6 +145,9 @@ OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, kernel.kernel_name = "OpTensorLeadingOnesGeneric"; } + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + using std::begin, std::end; kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); From 976bd84f1234ff7c1906db3b85e4f60b37346769 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Thu, 7 Nov 2024 15:29:44 +0200 Subject: [PATCH 09/25] additional changes --- src/solver/tensorOp/Op1dTensorGeneric.cpp | 17 +----- src/solver/tensorOp/Op2dTensorGeneric.cpp | 7 +-- src/solver/tensorOp/Op2dTensorLite.cpp | 44 ++++++-------- src/solver/tensorOp/Op2dTensorSquash.cpp | 33 +++++------ src/solver/tensorOp/Op3dTensorGeneric.cpp | 8 +-- src/solver/tensorOp/Op4dTensorGeneric.cpp | 13 +---- src/solver/tensorOp/Op4dTensorLite.cpp | 62 ++++++++------------ src/solver/tensorOp/Op5dTensorGeneric.cpp | 9 +-- src/solver/tensorOp/OpTensorFwdBias.cpp | 41 ++++++-------- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 63 +++++++++------------ src/solver/tensorOp/tensor_op_helpers.hpp | 60 ++++++++++---------- src/tensorOp/problem_description.cpp | 43 +++++++------- 12 files changed, 162 insertions(+), 238 deletions(-) diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index 640f9968e2..a149f488e2 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -42,12 +42,10 @@ bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); - // const auto& bTensorDesc = problem.GetBTensorDesc(); - const auto& alens = aTensorDesc.GetLengths(); - // const auto& blens = bTensorDesc.GetLengths(); + const auto& alens = aTensorDesc.GetLengths(); auto asize = alens.size(); - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } @@ -56,16 +54,7 @@ bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, { return true; } - // add support for this later - // if(asize == 2 && ((blens[0] == 1 && blens[1] == 1) || (blens[0] > 1 && blens[1] > 1))) - // { - // return true; - // } - // if(asize == 3 && ((blens[0] == 1 && blens[1] == 1 && blens[2] == 1) || - // (blens[0] > 1 && blens[1] > 1 && blens[2] > 1))) - // { - // return true; - // } + return false; } diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index 92a19d6a99..3f4b03c2d1 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -41,12 +41,10 @@ bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); - // const auto& bTensorDesc = problem.GetBTensorDesc(); - const auto& alens = aTensorDesc.GetLengths(); - // const auto& blens = bTensorDesc.GetLengths(); + const auto& alens = aTensorDesc.GetLengths(); auto asize = alens.size(); - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } @@ -55,7 +53,6 @@ bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, { return true; } - // add applicable when asize 
== 3 and some special cases for b dimensions return false; } diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index 6dedb553e5..d76c4f57f6 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -51,33 +51,31 @@ bool Op2dTensorLite::IsApplicable(const ExecutionContext& context, auto asize = alens.size(); - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } - if(asize < 3) + if(asize == 3) { - return false; - } + size_t local_threads = 256; + int max_num_wg = 4096; - size_t local_threads = 256; - int max_num_wg = 4096; + // for naive tensor ops + size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1; + size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); + size_t grp_sz = (total_work + local_threads - 1) / local_threads; - // for naive tensor ops - size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1; - size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); - size_t grp_sz = (total_work + local_threads - 1) / local_threads; - - // opencl kernels are no longer supported, fallback to generic case - bool lite_applicable = grp_sz <= size_t(max_num_wg); + // opencl kernels are no longer supported, fallback to generic case + bool lite_applicable = grp_sz <= size_t(max_num_wg); - bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 && - (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2]; + bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 && + (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2]; - if(asize == 3 && lite_applicable && is_lite) - { - return true; + if(lite_applicable && is_lite) + { + return true; + } } return false; @@ -101,11 +99,7 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); - int num_wg = 0; - int work_per_wg = 0; - unsigned int bitmap = 0; - - GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int max_num_wg = 4096; num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; @@ -113,9 +107,7 @@ ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, size_t local_threads = 256; // for naive tensor ops - size_t RD_BLCK = size_t(1); - std::string READ_TYPE = ""; - GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType(), RD_BLCK, READ_TYPE); + auto&& [RD_BLCK, READ_TYPE] = GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType()); size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); size_t grp_sz = (total_work + local_threads - 1) / local_threads; diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index 2cb95d1088..93f2868905 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -50,25 +50,24 @@ bool Op2dTensorSquash::IsApplicable(const ExecutionContext& context, auto asize = alens.size(); - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } - if(asize < 3) + if(asize == 3) { - return false; - } - - bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 && - (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2]; + bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 && + (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2]; - bool is_squashed = problem.GetNonStandardSquash() && !is_lite && - (blens[0] == 1 && clens[0] == 1 && clens[1] == 1 && blens[2] == clens[2]); + bool is_squashed = + problem.GetNonStandardSquash() && !is_lite && + (blens[0] == 1 && clens[0] == 1 && clens[1] == 1 && blens[2] == clens[2]); - if(asize == 3 && is_squashed) - { - return true; + if(is_squashed) + { + return true; + } } return false; @@ -93,11 +92,7 @@ Op2dTensorSquash::GetSolution(const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); - int num_wg = 0; - int work_per_wg = 0; - unsigned int bitmap = 0; - - GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int max_num_wg = 4096; num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; @@ -105,9 +100,7 @@ Op2dTensorSquash::GetSolution(const ExecutionContext& context, size_t local_threads = 256; // for naive tensor ops - size_t RD_BLCK = size_t(1); - std::string READ_TYPE = ""; - GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType(), RD_BLCK, READ_TYPE); + auto&& [RD_BLCK, READ_TYPE] = GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType()); size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); size_t grp_sz = (total_work + local_threads - 1) / local_threads; diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index 252c8f937b..1aeb83509d 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -44,7 +44,7 @@ bool Op3dTensorGeneric::IsApplicable(const ExecutionContext& context, const auto& alens = aTensorDesc.GetLengths(); auto asize = alens.size(); - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } @@ -76,11 +76,7 @@ Op3dTensorGeneric::GetSolution(const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); - int num_wg = 0; - int work_per_wg = 0; - unsigned int bitmap = 0; - - GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int num_wg_orig = num_wg; int max_num_wg = 4096; diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index fb50c37a69..ad17bf8791 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -44,7 +44,7 @@ bool Op4dTensorGeneric::IsApplicable(const ExecutionContext& context, const auto& alens = aTensorDesc.GetLengths(); auto asize = alens.size(); - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } @@ -71,16 +71,9 @@ Op4dTensorGeneric::GetSolution(const ExecutionContext& context, auto result = ConvSolution{miopenStatusSuccess}; int max_num_wg = 4096; - int num_wg_orig = 0; - int work_per_wg = 0; - int incr_wg = 0; - unsigned int bitmap = 0; - size_t local_threads = 0; - size_t global_threads = 0; - - Get4dParams( - problem, false, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = Get4dParams( + problem, false); const std::array vld{local_threads, 1, 1}; const std::array vgd{global_threads, 1, 1}; diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp index d0431cb1f9..588a2c6ef9 100644 --- a/src/solver/tensorOp/Op4dTensorLite.cpp +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -50,36 +50,33 @@ bool Op4dTensorLite::IsApplicable(const ExecutionContext& context, auto asize = alens.size(); - int num_wg = 0; - int work_per_wg = 0; - unsigned int bitmap = 0; - - GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); - - // quick fix for btensor = <1, 1, 1, 1> - if(bTensorDesc.GetElementSize() == 1) - bitmap = 4; + if(aTensorDesc.GetType() == miopenDouble) + { + return false; + } - auto fwd_conv_bias = bitmap == (1 << 2) ? 
1 : 0; + if(asize == 4) + { + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); - bool packed_tensor = true; + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; - // auto alens = aTensorDesc.GetLengths(); - packed_tensor &= aTensorDesc.IsPacked(); - packed_tensor &= bTensorDesc.IsPacked(); - packed_tensor &= cTensorDesc.IsPacked(); + bool fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; - bool packed_equal_tensor = - packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); + bool packed_tensor = true; + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + packed_tensor &= cTensorDesc.IsPacked(); - if(GetDataType(aTensorDesc.GetType()) == "double") - { - return false; - } + bool packed_equal_tensor = + packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); - if(asize == 4 && fwd_conv_bias == 0 && packed_equal_tensor) - { - return true; + if(fwd_conv_bias == 0 && packed_equal_tensor) + { + return true; + } } return false; @@ -100,20 +97,11 @@ ConvSolution Op4dTensorLite::GetSolution(const ExecutionContext& context, const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); - int num_wg_orig = 0; - int work_per_wg = 0; - int incr_wg = 0; - unsigned int bitmap = 0; - - size_t local_threads = 0; - size_t global_threads = 0; - - Get4dParams( - problem, true, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = + Get4dParams(problem, true); - size_t RD_BLCK = size_t(1); - std::string READ_TYPE = ""; - GetRDBLCKandREADTYPE(cTensorDesc.GetElementSize(), bTensorDesc.GetType(), RD_BLCK, READ_TYPE); + auto&& [RD_BLCK, READ_TYPE] = + GetRDBLCKandREADTYPE(cTensorDesc.GetElementSize(), bTensorDesc.GetType()); size_t total_work = std::max(cTensorDesc.GetElementSize() / RD_BLCK, size_t(1)); diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp index 1756111326..b0cb0397e0 100644 --- a/src/solver/tensorOp/Op5dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp @@ -45,7 +45,7 @@ bool Op5dTensorGeneric::IsApplicable(const ExecutionContext& context, const auto& alens = aTensorDesc.GetLengths(); auto asize = alens.size(); - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } @@ -54,6 +54,7 @@ bool Op5dTensorGeneric::IsApplicable(const ExecutionContext& context, { return true; } + return false; } @@ -76,11 +77,7 @@ Op5dTensorGeneric::GetSolution(const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); - int num_wg = 0; - int work_per_wg = 0; - unsigned int bitmap = 0; - - GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int num_wg_orig = num_wg; int max_num_wg = 4096; diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index c2aa07c8ae..05c1984941 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -50,28 +50,26 @@ bool OpTensorFwdBias::IsApplicable(const ExecutionContext& context, auto asize = alens.size(); - int num_wg = 0; - int work_per_wg = 0; - unsigned int bitmap = 0; - - GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); - - // quick fix 
for btensor = <1, 1, 1, 1> - if(bTensorDesc.GetElementSize() == 1) - bitmap = 4; - - auto fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; - - if(GetDataType(aTensorDesc.GetType()) == "double") + if(aTensorDesc.GetType() == miopenDouble) { return false; } - if(asize == 4 && fwd_conv_bias != 0) + if(asize == 4) { - return true; - } + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; + + bool fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; + + if(fwd_conv_bias != 0) + { + return true; + } + } return false; } @@ -92,16 +90,9 @@ ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, const auto& cTensorDesc = problem.GetCTensorDesc(); int max_num_wg = 4096; - int num_wg_orig = 0; - int work_per_wg = 0; - int incr_wg = 0; - unsigned int bitmap = 0; - - size_t local_threads = 0; - size_t global_threads = 0; - Get4dParams( - problem, false, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = + Get4dParams(problem, false); const std::array vld{local_threads, 1, 1}; const std::array vgd{global_threads, 1, 1}; diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index 8f4a4399ce..3b99e0a8e5 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -50,41 +50,41 @@ bool OpTensorLeadingOnes::IsApplicable(const ExecutionContext& context, auto asize = alens.size(); - int num_wg = 0; - int work_per_wg = 0; - unsigned int bitmap = 0; + if(aTensorDesc.GetType() == miopenDouble) + { + return false; + } - GetBitmapAndWgInfo(blens, clens, num_wg, work_per_wg, bitmap); + if(asize == 4) + { - // quick fix for btensor = <1, 1, 1, 1> - if(bTensorDesc.GetElementSize() == 1) - bitmap = 4; + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); - auto fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; - bool packed_tensor = true; - packed_tensor &= aTensorDesc.IsPacked(); - packed_tensor &= bTensorDesc.IsPacked(); - packed_tensor &= cTensorDesc.IsPacked(); + bool fwd_conv_bias = bitmap == (1 << 2) ? 
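[Context for the fwd_conv_bias test: after the <1, 1, 1, 1> quick fix, a bitmap equal to (1 << 2) has exactly one dimension bit set, the pattern the surrounding comments describe as a forward-convolution bias tensor. A standalone illustration; the bit layout is assumed from those comments, not taken from MIOpen headers.]

#include <cassert>

inline bool IsFwdConvBias(unsigned int bitmap) { return bitmap == (1u << 2); }

int main()
{
    assert(IsFwdConvBias(0b0100u));  // only one dim participates: bias case
    assert(!IsFwdConvBias(0b0101u)); // extra dims set: generic path instead
    return 0;
}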
1 : 0; - bool packed_equal_tensor = - packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); + bool packed_tensor = true; + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + packed_tensor &= cTensorDesc.IsPacked(); - bool leading_ones = true; - // first_not_one is incorrect if btensor size equal to 1 - auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); - auto d = std::distance(blens.begin(), first_not_one.base()); + bool packed_equal_tensor = + packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); - IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2), leading_ones); + // first_not_one is incorrect if btensor size equal to 1 + auto first_not_one = + std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); + auto d = std::distance(blens.begin(), first_not_one.base()); - if(GetDataType(aTensorDesc.GetType()) == "double") - { - return false; - } + bool leading_ones = IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2)); - if(asize == 4 && fwd_conv_bias == 0 && !packed_equal_tensor && leading_ones) - { - return true; + if(fwd_conv_bias == 0 && !packed_equal_tensor && leading_ones) + { + return true; + } } return false; @@ -108,16 +108,9 @@ OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, const auto& cTensorDesc = problem.GetCTensorDesc(); int max_num_wg = 4096; - int num_wg_orig = 0; - int work_per_wg = 0; - int incr_wg = 0; - unsigned int bitmap = 0; - - size_t local_threads = 0; - size_t global_threads = 0; - Get4dParams( - problem, false, num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = + Get4dParams(problem, false); const std::array vld{local_threads, 1, 1}; const std::array vgd{global_threads, 1, 1}; diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp index 02b1a137d0..2162ed9208 100644 --- a/src/solver/tensorOp/tensor_op_helpers.hpp +++ b/src/solver/tensorOp/tensor_op_helpers.hpp @@ -29,6 +29,8 @@ #include #include +#include + namespace miopen { namespace solver { @@ -36,7 +38,7 @@ namespace solver { namespace tensorOp { inline void GetCommonParams(KernelBuildParameters& build_params, - miopen::tensorOp::ProblemDescription problem, + const miopen::tensorOp::ProblemDescription& problem, bool is64bSupported) { build_params.Define("MIOPEN_TYPE", miopen::GetDataType(problem.GetBTensorDesc().GetType())); @@ -59,31 +61,29 @@ inline void GetCommonParams(KernelBuildParameters& build_params, } } -inline void -GetRDBLCKandREADTYPE(size_t len, miopenDataType_t type, size_t& RD_BLCK, std::string& READ_TYPE) +inline std::tuple GetRDBLCKandREADTYPE(size_t len, miopenDataType_t type) { - RD_BLCK = (len % 4 == 0) ? 4 : (len % 2 == 0) ? 2 : 1; const std::string data_type = GetDataType(type); - READ_TYPE = (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK); + size_t RD_BLCK = (len % 4 == 0) ? 4 : (len % 2 == 0) ? 2 : 1; + return std::make_tuple(RD_BLCK, + (RD_BLCK == 1) ? 
data_type : data_type + std::to_string(RD_BLCK)); } -inline void GetBitmapAndWgInfo(const std::vector& blens, - const std::vector& clens, - int& num_wg, - int& work_per_wg, - unsigned int& bitmap) +inline std::tuple GetBitmapAndWgInfo(const std::vector& blens, + const std::vector& clens) { // first_not_one is incorrect if btensor size equal to 1 auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); auto d = std::distance(blens.begin(), first_not_one.base()); // quick fix - num_wg = first_not_one != blens.rend() - ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) - : 1; + int num_wg = first_not_one != blens.rend() + ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) + : 1; - work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + unsigned int bitmap = 0; // update bitmap for first_not_one bitmap |= (1 << (blens.size() - d)); @@ -99,26 +99,23 @@ inline void GetBitmapAndWgInfo(const std::vector& blens, work_per_wg *= clens[i]; } } + + return std::make_tuple(num_wg, work_per_wg, bitmap); } -inline void -IsBitmapLeadingOnes(unsigned int bitmap, int n_size, int first_not_one, bool& leading_ones) +inline bool IsBitmapLeadingOnes(unsigned int bitmap, int n_size, int first_not_one) { + bool leading_ones = true; for(int i = first_not_one; i >= 0; i--) { bool is_one = (bitmap & (1 << (n_size - 1 - i))) != 0u; leading_ones &= is_one; } + return leading_ones; } -inline void Get4dParams(const miopen::tensorOp::ProblemDescription& problem, - bool is4dLite, - int& num_wg_orig, - int& work_per_wg, - int& incr_wg, - unsigned int& bitmap, - size_t& local_threads, - size_t& global_threads) +inline std::tuple Get4dParams(const miopen::tensorOp::ProblemDescription& problem, + bool is4dLite) { const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); @@ -137,8 +134,9 @@ inline void Get4dParams(const miopen::tensorOp::ProblemDescription& problem, ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) : 1; - work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + unsigned int bitmap = 0; // update bitmap for first_not_one bitmap |= (1 << (blens.size() - d)); @@ -159,6 +157,7 @@ inline void Get4dParams(const miopen::tensorOp::ProblemDescription& problem, if(bTensorDesc.GetElementSize() == 1) bitmap = 4; + int incr_wg = 0; // Forward Convolution Bias specialization // for fwd-bias, bitmap looks like <0, 1, 0, 0> // Is the no. of work-groups and the work for each wg balanced? @@ -172,14 +171,13 @@ inline void Get4dParams(const miopen::tensorOp::ProblemDescription& problem, incr_wg = 1; } - num_wg_orig = num_wg; + int num_wg_orig = num_wg; int max_num_wg = 4096; num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; - local_threads = 256; + size_t local_threads = 256; - bool leading_ones = true; - IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2), leading_ones); + bool leading_ones = IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2)); if(leading_ones && work_per_wg < 64) { @@ -187,7 +185,7 @@ inline void Get4dParams(const miopen::tensorOp::ProblemDescription& problem, } // Special case for adding tensors in place - global_threads = + size_t global_threads = (static_cast(leading_ones) == 1 && (d - 1) == 3) ? 
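[A standalone rendering of the bool-returning IsBitmapLeadingOnes above, plus a small self-check; the sample bitmaps are illustrative.]

#include <cassert>

inline bool IsBitmapLeadingOnes(unsigned int bitmap, int n_size, int first_not_one)
{
    bool leading_ones = true;
    for(int i = first_not_one; i >= 0; i--)
        leading_ones &= (bitmap & (1u << (n_size - 1 - i))) != 0u; // test dim i from the left
    return leading_ones;
}

int main()
{
    assert(IsBitmapLeadingOnes(0b1100u, 4, 1));  // the two leading dims are set
    assert(!IsBitmapLeadingOnes(0b1010u, 4, 1)); // second-from-left dim is unset
    return 0;
}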
num_wg : num_wg * local_threads; global_threads = (global_threads < local_threads) ? local_threads : global_threads; @@ -208,6 +206,8 @@ inline void Get4dParams(const miopen::tensorOp::ProblemDescription& problem, global_threads = glb_sz; } + + return std::make_tuple(num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); } } // namespace tensorOp diff --git a/src/tensorOp/problem_description.cpp b/src/tensorOp/problem_description.cpp index dc16276f05..4056fd3172 100644 --- a/src/tensorOp/problem_description.cpp +++ b/src/tensorOp/problem_description.cpp @@ -26,6 +26,7 @@ #include #include +#include namespace miopen { @@ -42,33 +43,27 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const auto bstrides = bTensorDesc.GetStrides(); auto cstrides = cTensorDesc.GetStrides(); - std::string alens_str{}; - std::string blens_str{}; - std::string astrides_str{}; - std::string bstrides_str{}; - std::string cstrides_str{}; - - for(uint32_t i = 0; i < alens.size(); i++) - { - alens_str += std::to_string(alens[i]); - blens_str += std::to_string(blens[i]); - astrides_str += std::to_string(astrides[i]); - bstrides_str += std::to_string(bstrides[i]); - cstrides_str += std::to_string(cstrides[i]); - - if(i != (alens.size() - 1)) + auto printDims = [&ss](const auto& dim) { + for(uint32_t i = 0; i < dim.size(); i++) { - alens_str += "x"; - blens_str += "x"; - astrides_str += "x"; - bstrides_str += "x"; - cstrides_str += "x"; + ss << dim[i]; + if(i != (dim.size() - 1)) + { + ss << "x"; + } } - } + ss << "-"; + }; + + ss << std::to_string(aTensorDesc.GetType()) << "-" << std::to_string(tensorOp) << "-"; + + printDims(alens); + printDims(blens); + printDims(astrides); + printDims(bstrides); + printDims(cstrides); - ss << std::to_string(aTensorDesc.GetType()) << "-" << std::to_string(tensorOp) << "-" - << alens_str << "-" << blens_str << "-" << astrides_str << "-" << bstrides_str << "-" - << cstrides_str << "-" << std::to_string((beta == 0)); + ss << (float_equal(beta, 0.0f) ? 
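[The MakeNetworkConfig rewrite above folds five parallel strings into a single stream written by one reusable lambda. A minimal sketch of the same key-building scheme; MakeKey is an illustrative name, not MIOpen API.]

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::string MakeKey(const std::vector<std::size_t>& lens,
                    const std::vector<std::size_t>& strides)
{
    std::ostringstream ss;
    auto printDims = [&ss](const auto& dims) {
        for(std::size_t i = 0; i < dims.size(); i++)
        {
            ss << dims[i];
            if(i != dims.size() - 1)
                ss << "x";
        }
        ss << "-";
    };
    printDims(lens);
    printDims(strides);
    return ss.str();
}

int main() { std::cout << MakeKey({2, 3, 4}, {12, 4, 1}) << '\n'; } // prints 2x3x4-12x4x1-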
"1" : "0"); return NetworkConfig{ss.str()}; } From 6be98d056a5531313afc661174f643d9d908bc97 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Thu, 7 Nov 2024 16:10:46 +0200 Subject: [PATCH 10/25] clang format --- src/include/miopen/tensorOp/problem_description.hpp | 2 ++ src/solver/tensorOp/Op1dTensorGeneric.cpp | 6 +++--- src/solver/tensorOp/Op2dTensorGeneric.cpp | 6 +++--- src/solver/tensorOp/Op4dTensorGeneric.cpp | 6 +++--- src/solver/tensorOp/OpTensorFwdBias.cpp | 6 +++--- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 6 +++--- src/solver/tensorOp/tensor_op_helpers.hpp | 13 +++++++------ test/tensor_ops.cpp | 2 +- 8 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/include/miopen/tensorOp/problem_description.hpp b/src/include/miopen/tensorOp/problem_description.hpp index 81621cfcbe..515955f5be 100644 --- a/src/include/miopen/tensorOp/problem_description.hpp +++ b/src/include/miopen/tensorOp/problem_description.hpp @@ -53,6 +53,7 @@ struct ProblemDescription : ProblemDescriptionBase { MIOPEN_THROW(miopenStatusBadParm, "Beta value is nullptr"); } + beta = *(static_cast(beta_)); if(aTensorDesc.GetElementSize() != cTensorDesc.GetElementSize()) @@ -67,6 +68,7 @@ struct ProblemDescription : ProblemDescriptionBase auto blens = bTensorDesc.GetLengths(); auto clens = cTensorDesc.GetLengths(); + if(clens.size() > 5) { MIOPEN_THROW("Tensor dimension larger than 5: " + std::to_string(clens.size())); diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index a149f488e2..581b22f68e 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -43,7 +43,7 @@ bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, { const auto& aTensorDesc = problem.GetATensorDesc(); const auto& alens = aTensorDesc.GetLengths(); - auto asize = alens.size(); + auto asize = alens.size(); if(aTensorDesc.GetType() == miopenDouble) { @@ -94,8 +94,8 @@ Op1dTensorGeneric::GetSolution(const ExecutionContext& context, auto kernel = KernelInfo{}; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); - kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; - kernel.kernel_name = "Op1dTensorGeneric"; + kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; + kernel.kernel_name = "Op1dTensorGeneric"; using std::begin, std::end; diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index 3f4b03c2d1..23b7210094 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -42,7 +42,7 @@ bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, { const auto& aTensorDesc = problem.GetATensorDesc(); const auto& alens = aTensorDesc.GetLengths(); - auto asize = alens.size(); + auto asize = alens.size(); if(aTensorDesc.GetType() == miopenDouble) { @@ -93,8 +93,8 @@ Op2dTensorGeneric::GetSolution(const ExecutionContext& context, auto kernel = KernelInfo{}; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); - kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; - kernel.kernel_name = "Op2dTensorGeneric"; + kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; + kernel.kernel_name = "Op2dTensorGeneric"; using std::begin, std::end; diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index ad17bf8791..0733981896 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -70,10 +70,10 @@ Op4dTensorGeneric::GetSolution(const 
ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; - int max_num_wg = 4096; + int max_num_wg = 4096; - auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = Get4dParams( - problem, false); + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = + Get4dParams(problem, false); const std::array vld{local_threads, 1, 1}; const std::array vgd{global_threads, 1, 1}; diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index 05c1984941..cec181fcf2 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -89,7 +89,7 @@ ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); - int max_num_wg = 4096; + int max_num_wg = 4096; auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = Get4dParams(problem, false); @@ -148,7 +148,7 @@ ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, const auto& cstrides = params.cTensorDesc.GetStrides(); if(packed_tensor) - { + { // OpTensorFwdBias kernel(params.ATensor, params.BTensor, static_cast(blens[1]), @@ -167,7 +167,7 @@ ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, static_cast(incr_wg)); } else - { + { // OpTensorFwdBiasGeneric kernel(params.ATensor, static_cast(astrides[0]), static_cast(astrides[1]), diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index 3b99e0a8e5..ad4e0f3116 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -107,7 +107,7 @@ OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); - int max_num_wg = 4096; + int max_num_wg = 4096; auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = Get4dParams(problem, false); @@ -164,7 +164,7 @@ OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, const auto& cstrides = params.cTensorDesc.GetStrides(); if(packed_tensor) - { + { // OpTensorLeadingOnes kernel(params.ATensor, params.BTensor, params.CTensor, @@ -184,7 +184,7 @@ OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, bitmap); } else - { + { // OpTensorLeadingOnesGeneric kernel(params.ATensor, static_cast(astrides[0]), static_cast(astrides[1]), diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp index 2162ed9208..46ce39e4a0 100644 --- a/src/solver/tensorOp/tensor_op_helpers.hpp +++ b/src/solver/tensorOp/tensor_op_helpers.hpp @@ -114,8 +114,8 @@ inline bool IsBitmapLeadingOnes(unsigned int bitmap, int n_size, int first_not_o return leading_ones; } -inline std::tuple Get4dParams(const miopen::tensorOp::ProblemDescription& problem, - bool is4dLite) +inline std::tuple +Get4dParams(const miopen::tensorOp::ProblemDescription& problem, bool is4dLite) { const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); @@ -171,9 +171,9 @@ inline std::tuple Get4dParams(const incr_wg = 1; } - int num_wg_orig = num_wg; - int max_num_wg = 4096; - num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; + int num_wg_orig = num_wg; + int max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; size_t local_threads = 256; @@ -207,7 +207,8 @@ inline std::tuple Get4dParams(const global_threads = glb_sz; } - return std::make_tuple(num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); + return std::make_tuple( + num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); } } // namespace tensorOp diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp index ee42a94ea6..1df83044b2 100644 --- a/test/tensor_ops.cpp +++ b/test/tensor_ops.cpp @@ -258,7 +258,7 @@ struct tensor_ops_driver : test_driver {1, 20, 16, 1}, {1, 20, 1, 1}, {1, 1, 16, 8}, - {1, 1, 1, 8}, + {1, 1, 1, 8}, {20, 16, 8}, {20, 16, 1}, {1, 16, 8}, From d6ffea5c372c5ae536a33040a80a75e46b118739 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Thu, 7 Nov 2024 17:00:48 +0200 Subject: [PATCH 11/25] fwd_conv_bias changed --- src/solver/tensorOp/Op4dTensorLite.cpp | 4 ++-- src/solver/tensorOp/OpTensorFwdBias.cpp | 4 ++-- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp index 588a2c6ef9..6751cfc9fc 100644 --- a/src/solver/tensorOp/Op4dTensorLite.cpp +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -63,7 +63,7 @@ bool Op4dTensorLite::IsApplicable(const ExecutionContext& context, if(bTensorDesc.GetElementSize() == 1) bitmap = 4; - bool fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; + bool fwd_conv_bias = (bitmap == (1 << 2)); bool packed_tensor = true; packed_tensor &= aTensorDesc.IsPacked(); @@ -73,7 +73,7 @@ bool Op4dTensorLite::IsApplicable(const ExecutionContext& context, bool packed_equal_tensor = packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); - if(fwd_conv_bias == 0 && packed_equal_tensor) + if(!fwd_conv_bias && packed_equal_tensor) { return true; } diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index cec181fcf2..8586b50034 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -63,9 +63,9 @@ bool OpTensorFwdBias::IsApplicable(const ExecutionContext& context, if(bTensorDesc.GetElementSize() == 1) bitmap = 4; - bool fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; + bool fwd_conv_bias = (bitmap == (1 << 2)); - if(fwd_conv_bias != 0) + if(fwd_conv_bias) { return true; } diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index ad4e0f3116..d8d09461ac 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -64,7 +64,7 @@ bool OpTensorLeadingOnes::IsApplicable(const ExecutionContext& context, if(bTensorDesc.GetElementSize() == 1) bitmap = 4; - bool fwd_conv_bias = bitmap == (1 << 2) ? 
1 : 0; + bool fwd_conv_bias = (bitmap == (1 << 2)); bool packed_tensor = true; packed_tensor &= aTensorDesc.IsPacked(); @@ -81,7 +81,7 @@ bool OpTensorLeadingOnes::IsApplicable(const ExecutionContext& context, bool leading_ones = IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2)); - if(fwd_conv_bias == 0 && !packed_equal_tensor && leading_ones) + if(!fwd_conv_bias && !packed_equal_tensor && leading_ones) { return true; } From 89dd24c8f5dd596ffec1b4c203b86210cf9048ba Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Fri, 8 Nov 2024 09:50:57 +0200 Subject: [PATCH 12/25] tidy some part of the code --- src/include/miopen/tensorOp/problem_description.hpp | 4 ++-- src/solver/tensorOp/Op1dTensorGeneric.cpp | 12 ++++++------ src/solver/tensorOp/Op2dTensorGeneric.cpp | 12 ++++++------ src/solver/tensorOp/Op2dTensorLite.cpp | 10 +++++----- src/solver/tensorOp/Op2dTensorSquash.cpp | 12 ++++++------ src/solver/tensorOp/Op3dTensorGeneric.cpp | 12 ++++++------ src/solver/tensorOp/Op4dTensorGeneric.cpp | 12 ++++++------ src/solver/tensorOp/Op4dTensorLite.cpp | 10 +++++----- src/solver/tensorOp/Op5dTensorGeneric.cpp | 8 ++++---- src/solver/tensorOp/OpTensorFwdBias.cpp | 10 +++++----- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 12 ++++++------ 11 files changed, 57 insertions(+), 57 deletions(-) diff --git a/src/include/miopen/tensorOp/problem_description.hpp b/src/include/miopen/tensorOp/problem_description.hpp index 515955f5be..8aa4529ee3 100644 --- a/src/include/miopen/tensorOp/problem_description.hpp +++ b/src/include/miopen/tensorOp/problem_description.hpp @@ -103,7 +103,7 @@ struct ProblemDescription : ProblemDescriptionBase } } - const miopenTensorOp_t GetTensorOp() const { return tensorOp; } + miopenTensorOp_t GetTensorOp() const { return tensorOp; } float GetBeta() const { return beta; } @@ -111,7 +111,7 @@ struct ProblemDescription : ProblemDescriptionBase const TensorDescriptor& GetBTensorDesc() const { return bTensorDesc; } const TensorDescriptor& GetCTensorDesc() const { return cTensorDesc; } - const bool GetNonStandardSquash() const { return nonStandardSquash; } + bool GetNonStandardSquash() const { return nonStandardSquash; } NetworkConfig MakeNetworkConfig() const override; diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index 581b22f68e..b66274ad58 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -38,7 +38,7 @@ namespace solver { namespace tensorOp { -bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, +bool Op1dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -58,16 +58,16 @@ bool Op1dTensorGeneric::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op1dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op1dTensorGeneric::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution -Op1dTensorGeneric::GetSolution(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const + Op1dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = 
ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index 23b7210094..f910f63507 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, +bool Op2dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -57,16 +57,16 @@ bool Op2dTensorGeneric::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op2dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op2dTensorGeneric::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution -Op2dTensorGeneric::GetSolution(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const + Op2dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index d76c4f57f6..73f3659081 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -38,7 +38,7 @@ namespace solver { namespace tensorOp { -bool Op2dTensorLite::IsApplicable(const ExecutionContext& context, +bool Op2dTensorLite::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -81,14 +81,14 @@ bool Op2dTensorLite::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op2dTensorLite::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op2dTensorLite::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution Op2dTensorLite::GetSolution(const ExecutionContext& context, +ConvSolution Op2dTensorLite::GetSolution([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index 93f2868905..2f793a3fb7 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op2dTensorSquash::IsApplicable(const ExecutionContext& context, +bool Op2dTensorSquash::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -73,16 +73,16 @@ bool Op2dTensorSquash::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op2dTensorSquash::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op2dTensorSquash::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] 
const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution -Op2dTensorSquash::GetSolution(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const + Op2dTensorSquash::GetSolution([[maybe unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index 1aeb83509d..d10d096536 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op3dTensorGeneric::IsApplicable(const ExecutionContext& context, +bool Op3dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -57,16 +57,16 @@ bool Op3dTensorGeneric::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op3dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op3dTensorGeneric::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution -Op3dTensorGeneric::GetSolution(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const + Op3dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index 0733981896..ed070dc684 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op4dTensorGeneric::IsApplicable(const ExecutionContext& context, +bool Op4dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -57,16 +57,16 @@ bool Op4dTensorGeneric::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op4dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op4dTensorGeneric::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution -Op4dTensorGeneric::GetSolution(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const + Op4dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp index 6751cfc9fc..41d6cdbe26 100644 --- a/src/solver/tensorOp/Op4dTensorLite.cpp +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op4dTensorLite::IsApplicable(const ExecutionContext& context, +bool Op4dTensorLite::IsApplicable([[maybe unused]] const ExecutionContext& context, const 
miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -82,14 +82,14 @@ bool Op4dTensorLite::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op4dTensorLite::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op4dTensorLite::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution Op4dTensorLite::GetSolution(const ExecutionContext& context, +ConvSolution Op4dTensorLite::GetSolution([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp index b0cb0397e0..e87246c384 100644 --- a/src/solver/tensorOp/Op5dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp @@ -38,7 +38,7 @@ namespace solver { namespace tensorOp { -bool Op5dTensorGeneric::IsApplicable(const ExecutionContext& context, +bool Op5dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -58,9 +58,9 @@ bool Op5dTensorGeneric::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -Op5dTensorGeneric::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t Op5dTensorGeneric::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index 8586b50034..a1433c1f7f 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool OpTensorFwdBias::IsApplicable(const ExecutionContext& context, +bool OpTensorFwdBias::IsApplicable([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -73,14 +73,14 @@ bool OpTensorFwdBias::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -OpTensorFwdBias::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t OpTensorFwdBias::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution OpTensorFwdBias::GetSolution(const ExecutionContext& context, +ConvSolution OpTensorFwdBias::GetSolution([[maybe unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index d8d09461ac..4526c05329 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool OpTensorLeadingOnes::IsApplicable(const ExecutionContext& context, +bool OpTensorLeadingOnes::IsApplicable([[maybe unused]] const ExecutionContext& context, const 
miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -90,16 +90,16 @@ bool OpTensorLeadingOnes::IsApplicable(const ExecutionContext& context, return false; } -std::size_t -OpTensorLeadingOnes::GetWorkspaceSize(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +std::size_t OpTensorLeadingOnes::GetWorkspaceSize( + [[maybe unused]] const ExecutionContext& context, + [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution -OpTensorLeadingOnes::GetSolution(const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const + OpTensorLeadingOnes::GetSolution([[maybe unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; From 5a9b5edd1420faaaafdfdc2e4e43f9413e1d9459 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Fri, 8 Nov 2024 10:08:23 +0200 Subject: [PATCH 13/25] fix typos --- src/solver/tensorOp/Op1dTensorGeneric.cpp | 10 +++++----- src/solver/tensorOp/Op2dTensorGeneric.cpp | 10 +++++----- src/solver/tensorOp/Op2dTensorLite.cpp | 8 ++++---- src/solver/tensorOp/Op2dTensorSquash.cpp | 10 +++++----- src/solver/tensorOp/Op3dTensorGeneric.cpp | 10 +++++----- src/solver/tensorOp/Op4dTensorGeneric.cpp | 10 +++++----- src/solver/tensorOp/Op4dTensorLite.cpp | 8 ++++---- src/solver/tensorOp/Op5dTensorGeneric.cpp | 6 +++--- src/solver/tensorOp/OpTensorFwdBias.cpp | 8 ++++---- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 10 +++++----- 10 files changed, 45 insertions(+), 45 deletions(-) diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index b66274ad58..2b5338134b 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -38,7 +38,7 @@ namespace solver { namespace tensorOp { -bool Op1dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op1dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -59,15 +59,15 @@ bool Op1dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& co } std::size_t Op1dTensorGeneric::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution - Op1dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index f910f63507..ac5658f144 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op2dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op2dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { 
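[For reference, the typo this patch fixes is inside the attribute itself: C++17's maybe_unused is a single token spelled with an underscore. A minimal illustration.]

// OK: suppresses unused-parameter warnings on builds with -Wunused.
void IsApplicableLike([[maybe_unused]] int context) {}

// "[[maybe unused]]" (space instead of underscore) is not a valid
// attribute-token sequence and is rejected by compilers, hence this fix-up.
int main()
{
    IsApplicableLike(0);
    return 0;
}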
const auto& aTensorDesc = problem.GetATensorDesc(); @@ -58,15 +58,15 @@ bool Op2dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& co } std::size_t Op2dTensorGeneric::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution - Op2dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index 73f3659081..3eb8688cd1 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -38,7 +38,7 @@ namespace solver { namespace tensorOp { -bool Op2dTensorLite::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op2dTensorLite::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -82,13 +82,13 @@ bool Op2dTensorLite::IsApplicable([[maybe unused]] const ExecutionContext& conte } std::size_t Op2dTensorLite::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution Op2dTensorLite::GetSolution([[maybe unused]] const ExecutionContext& context, +ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index 2f793a3fb7..b5bee28bd5 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op2dTensorSquash::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op2dTensorSquash::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -74,15 +74,15 @@ bool Op2dTensorSquash::IsApplicable([[maybe unused]] const ExecutionContext& con } std::size_t Op2dTensorSquash::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution - Op2dTensorSquash::GetSolution([[maybe unused]] const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp 
index d10d096536..049fb6860f 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op3dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op3dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -58,15 +58,15 @@ bool Op3dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& co } std::size_t Op3dTensorGeneric::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution - Op3dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index ed070dc684..92f179f772 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op4dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op4dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -58,15 +58,15 @@ bool Op4dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& co } std::size_t Op4dTensorGeneric::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } ConvSolution - Op4dTensorGeneric::GetSolution([[maybe unused]] const ExecutionContext& context, - const miopen::tensorOp::ProblemDescription& problem) const +Op4dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp index 41d6cdbe26..96ca761063 100644 --- a/src/solver/tensorOp/Op4dTensorLite.cpp +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool Op4dTensorLite::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op4dTensorLite::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -83,13 +83,13 @@ bool Op4dTensorLite::IsApplicable([[maybe unused]] const ExecutionContext& conte } std::size_t Op4dTensorLite::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const 
miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution Op4dTensorLite::GetSolution([[maybe unused]] const ExecutionContext& context, +ConvSolution Op4dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp index e87246c384..63a7f5ddbc 100644 --- a/src/solver/tensorOp/Op5dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp @@ -38,7 +38,7 @@ namespace solver { namespace tensorOp { -bool Op5dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool Op5dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -59,8 +59,8 @@ bool Op5dTensorGeneric::IsApplicable([[maybe unused]] const ExecutionContext& co } std::size_t Op5dTensorGeneric::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index a1433c1f7f..09a595582b 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool OpTensorFwdBias::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool OpTensorFwdBias::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -74,13 +74,13 @@ bool OpTensorFwdBias::IsApplicable([[maybe unused]] const ExecutionContext& cont } std::size_t OpTensorFwdBias::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const { return 0; } -ConvSolution OpTensorFwdBias::GetSolution([[maybe unused]] const ExecutionContext& context, +ConvSolution OpTensorFwdBias::GetSolution([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index 4526c05329..11d33005b7 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -37,7 +37,7 @@ namespace solver { namespace tensorOp { -bool OpTensorLeadingOnes::IsApplicable([[maybe unused]] const ExecutionContext& context, +bool OpTensorLeadingOnes::IsApplicable([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { const auto& aTensorDesc = problem.GetATensorDesc(); @@ -91,15 +91,15 @@ bool OpTensorLeadingOnes::IsApplicable([[maybe unused]] const ExecutionContext& } std::size_t OpTensorLeadingOnes::GetWorkspaceSize( - [[maybe unused]] const ExecutionContext& context, - [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const + [[maybe_unused]] const 
ExecutionContext& context,
-    [[maybe unused]] const miopen::tensorOp::ProblemDescription& problem) const
+    [[maybe_unused]] const ExecutionContext& context,
+    [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const
 {
     return 0;
 }
 
 ConvSolution
-    OpTensorLeadingOnes::GetSolution([[maybe unused]] const ExecutionContext& context,
-                                     const miopen::tensorOp::ProblemDescription& problem) const
+OpTensorLeadingOnes::GetSolution([[maybe_unused]] const ExecutionContext& context,
+                                 const miopen::tensorOp::ProblemDescription& problem) const
 {
     auto result = ConvSolution{miopenStatusSuccess};

From c9f310aa852c74f2e59335956b36e5049ac10481 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Fri, 15 Nov 2024 19:37:03 +0200
Subject: [PATCH 14/25] implementing suggestions, updating network_config and changes to potentially boost performance on host side

---
 src/include/miopen/names.hpp                  |  1 +
 src/include/miopen/tensorOp/invoke_params.hpp | 25 +---
 .../miopen/tensorOp/problem_description.hpp   | 24 ++--
 src/solver/tensorOp/Op1dTensorGeneric.cpp     | 53 ++++----
 src/solver/tensorOp/Op2dTensorGeneric.cpp     | 92 +++++++++-----
 src/solver/tensorOp/Op2dTensorLite.cpp        | 33 +++--
 src/solver/tensorOp/Op2dTensorSquash.cpp      | 18 +--
 src/solver/tensorOp/Op3dTensorGeneric.cpp     | 29 +++--
 src/solver/tensorOp/Op4dTensorGeneric.cpp     | 40 ++++--
 src/solver/tensorOp/Op4dTensorLite.cpp        |  6 +-
 src/solver/tensorOp/Op5dTensorGeneric.cpp     | 114 ++++++++++--------
 src/solver/tensorOp/OpTensorFwdBias.cpp       | 37 ++++--
 src/solver/tensorOp/OpTensorLeadingOnes.cpp   | 33 +++--
 src/tensor.cpp                                | 16 +--
 src/tensorOp/problem_description.cpp          | 37 +++---
 15 files changed, 328 insertions(+), 230 deletions(-)

diff --git a/src/include/miopen/names.hpp b/src/include/miopen/names.hpp
index 17b96b8732..bdf59c361c 100644
--- a/src/include/miopen/names.hpp
+++ b/src/include/miopen/names.hpp
@@ -34,6 +34,7 @@ struct NetworkConfig
 {
     NetworkConfig() = default;
     explicit NetworkConfig(const std::string& value_) : value(value_) {}
+    explicit NetworkConfig(std::string&& value_) noexcept : value(std::move(value_)) {}
     operator std::string() const { return value; }
     const std::string& ToString() const { return value; }
 
diff --git a/src/include/miopen/tensorOp/invoke_params.hpp b/src/include/miopen/tensorOp/invoke_params.hpp
index 99ff13da47..6b8f2ca88c 100644
--- a/src/include/miopen/tensorOp/invoke_params.hpp
+++ b/src/include/miopen/tensorOp/invoke_params.hpp
@@ -35,34 +35,24 @@ namespace tensorOp {
 
 struct InvokeParams : public miopen::InvokeParams
 {
-    InvokeParams(miopenTensorOp_t tensorOp_,
-                 const void* alpha0_,
-                 const TensorDescriptor& aTensorDesc_,
+    InvokeParams(const void* alpha0_,
                  ConstData_t ATensor_,
                  const void* alpha1_,
-                 const TensorDescriptor& bTensorDesc_,
                  ConstData_t BTensor_,
                  const void* beta_,
-                 const TensorDescriptor& cTensorDesc_,
                  Data_t CTensor_,
                  const size_t Aoffset_,
                  const size_t Boffset_,
-                 const size_t Coffset_,
-                 const bool nonStandardSquash_)
+                 const size_t Coffset_)
         : alpha0(alpha0_),
          alpha1(alpha1_),
          beta(beta_),
-         tensorOperation(tensorOp_),
-         aTensorDesc(aTensorDesc_),
          ATensor(ATensor_),
-         bTensorDesc(bTensorDesc_),
          BTensor(BTensor_),
-         cTensorDesc(cTensorDesc_),
          CTensor(CTensor_),
          Aoffset(Aoffset_),
          Boffset(Boffset_),
-         Coffset(Coffset_),
-         nonStandardSquash(nonStandardSquash_)
+         Coffset(Coffset_)
     {
     }
 
@@ -74,22 +64,13 @@ struct InvokeParams : public miopen::InvokeParams
     const void* alpha1;
     const void* beta;
 
-    miopenTensorOp_t tensorOperation;
-
-    TensorDescriptor aTensorDesc;
     ConstData_t ATensor;
-
-    TensorDescriptor bTensorDesc;
     ConstData_t BTensor;
-
-    TensorDescriptor cTensorDesc;
     Data_t CTensor;
 
     size_t Aoffset;
     size_t Boffset;
     size_t Coffset;
-
-    
bool nonStandardSquash; }; } // namespace tensorOp diff --git a/src/include/miopen/tensorOp/problem_description.hpp b/src/include/miopen/tensorOp/problem_description.hpp index 8aa4529ee3..dc60a3c7c9 100644 --- a/src/include/miopen/tensorOp/problem_description.hpp +++ b/src/include/miopen/tensorOp/problem_description.hpp @@ -66,8 +66,8 @@ struct ProblemDescription : ProblemDescriptionBase MIOPEN_THROW("Datatypes for B and C tensors do not match !"); } - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); if(clens.size() > 5) { @@ -82,14 +82,12 @@ struct ProblemDescription : ProblemDescriptionBase if(!nonStandardSquash) { - for(std::size_t i = 0; i < clens.size(); i++) - { - if(blens[i] != 1 && blens[i] != clens[i]) - { - MIOPEN_THROW("BTensor dim != 1 && BTensor dim != CTensor dim: " + - std::to_string(i)); - } - } + constexpr auto comparator = [](size_t c, size_t b) { return b == 1 || b == c; }; + const auto [c_diff, b_diff] = + std::mismatch(clens.begin(), clens.end(), blens.begin(), comparator); + if(c_diff != clens.end()) + MIOPEN_THROW("BTensor dim != 1 && BTensor dim != CTensor dim:" + + std::to_string(std::distance(clens.begin(), c_diff))); } else { @@ -120,9 +118,9 @@ struct ProblemDescription : ProblemDescriptionBase float beta; - const TensorDescriptor& aTensorDesc; - const TensorDescriptor& bTensorDesc; - const TensorDescriptor& cTensorDesc; + TensorDescriptor aTensorDesc; + TensorDescriptor bTensorDesc; + TensorDescriptor cTensorDesc; const bool nonStandardSquash; }; diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index 2b5338134b..7a2662e60b 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -71,14 +71,24 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); - const auto& clens = cTensorDesc.GetLengths(); + const size_t b_n = bTensorDesc.GetLengths()[0]; + const size_t c_n = cTensorDesc.GetLengths()[0]; + + const size_t a_nstrides = aTensorDesc.GetStrides()[0]; + const size_t b_nstrides = bTensorDesc.GetStrides()[0]; + const size_t c_nstrides = cTensorDesc.GetStrides()[0]; + + miopenDataType_t data_type = bTensorDesc.GetType(); + bool fit_into_int = aTensorDesc.AllDimsFitIntoInt(); size_t local_threads = 256; size_t max_num_wg = 4096; - auto num_wg = std::clamp(clens[0] / local_threads, size_t(1), size_t(max_num_wg)); + auto num_wg = std::clamp(c_n / local_threads, size_t(1), size_t(max_num_wg)); num_wg = num_wg > max_num_wg ? 
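[A standalone form of the std::mismatch-based broadcast check introduced in problem_description.hpp above: every B length must be 1 or equal to the corresponding C length. The function name is illustrative.]

#include <algorithm>
#include <cstddef>
#include <vector>

bool BroadcastCompatible(const std::vector<std::size_t>& clens,
                         const std::vector<std::size_t>& blens)
{
    auto ok = [](std::size_t c, std::size_t b) { return b == 1 || b == c; };
    // first mismatching position, or clens.end() if fully compatible
    return std::mismatch(clens.begin(), clens.end(), blens.begin(), ok).first == clens.end();
}

// BroadcastCompatible({4, 8, 16}, {1, 8, 1})  -> true
// BroadcastCompatible({4, 8, 16}, {2, 8, 16}) -> false (dim 0 offends)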
max_num_wg : num_wg; size_t global_threads = num_wg * local_threads; @@ -102,38 +112,33 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [](const std::vector kernels) { + result + .invoker_factory = [data_type, fit_into_int, b_n, c_n, a_nstrides, b_nstrides, c_nstrides]( + const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& blens = params.bTensorDesc.GetLengths(); - const auto& clens = params.cTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - - if(params.aTensorDesc.AllDimsFitIntoInt()) - { // change offsets to 64bit after PR is merged + if(fit_into_int) + { kernel(params.ATensor, params.BTensor, params.CTensor, static_cast(params.Aoffset), static_cast(params.Boffset), static_cast(params.Coffset), - static_cast(astrides[0]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(cstrides[0]), + static_cast(a_nstrides), + static_cast(b_n == 1 ? 0 : b_nstrides), + static_cast(c_nstrides), miopen_alpha0, miopen_alpha1, miopen_beta, - static_cast(clens[0]), + static_cast(c_n), !float_equal(miopen_beta, 0.0)); } else @@ -141,16 +146,16 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel(params.ATensor, params.BTensor, params.CTensor, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(astrides[0]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(cstrides[0]), + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(a_nstrides), + static_cast(b_n == 1 ? 
0 : b_nstrides), + static_cast(c_nstrides), miopen_alpha0, miopen_alpha1, miopen_beta, - static_cast(clens[0]), + static_cast(c_n), !float_equal(miopen_beta, 0.0)); } }); diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index ac5658f144..917eee17c2 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -70,9 +70,24 @@ Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); - const auto& clens = cTensorDesc.GetLengths(); + std::array blens; + std::array clens; + std::tie(blens[0], blens[1]) = miopen::tien<2>(bTensorDesc.GetLengths()); + std::tie(clens[0], clens[1]) = miopen::tien<2>(cTensorDesc.GetLengths()); + + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1]) = miopen::tien<2>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1]) = miopen::tien<2>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1]) = miopen::tien<2>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + bool fit_into_int = aTensorDesc.AllDimsFitIntoInt(); size_t local_threads = 32; size_t max_num_wg = 4096; @@ -101,42 +116,61 @@ Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [](const std::vector kernels) { + result.invoker_factory = [data_type, fit_into_int, blens, clens, astrides, bstrides, cstrides]( + const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& blens = params.bTensorDesc.GetLengths(); - const auto& clens = params.cTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - - kernel(params.ATensor, - params.BTensor, - params.CTensor, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(blens[1] == 1 ? clens[1] : blens[1]), - static_cast(clens[1]), - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(blens[1] == 1 ? 0 : bstrides[1]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); + if(fit_into_int) + { + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(blens[1] == 1 ? clens[1] : blens[1]), + static_cast(clens[1]), + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(blens[0] == 1 ? 0 : bstrides[0]), + static_cast(blens[1] == 1 ? 
0 : bstrides[1]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(clens[0]), + !float_equal(miopen_beta, 0.0)); + } + else + { + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(blens[1] == 1 ? clens[1] : blens[1]), + static_cast(clens[1]), + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(blens[0] == 1 ? 0 : bstrides[0]), + static_cast(blens[1] == 1 ? 0 : bstrides[1]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(clens[0]), + !float_equal(miopen_beta, 0.0)); + } }); }; }; diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index 3eb8688cd1..696d73b073 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -93,12 +93,19 @@ ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext { auto result = ConvSolution{miopenStatusSuccess}; + const auto& aTensorDesc = problem.GetATensorDesc(); const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); + const size_t a_cstride = aTensorDesc.GetStrides()[1]; + const size_t b_cstride = bTensorDesc.GetStrides()[1]; + const size_t c_cstride = cTensorDesc.GetStrides()[1]; + + miopenDataType_t data_type = bTensorDesc.GetType(); + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int max_num_wg = 4096; @@ -107,7 +114,7 @@ ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext size_t local_threads = 256; // for naive tensor ops - auto&& [RD_BLCK, READ_TYPE] = GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType()); + auto&& [RD_BLCK, READ_TYPE] = GetRDBLCKandREADTYPE(clens[2], data_type); size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); size_t grp_sz = (total_work + local_threads - 1) / local_threads; @@ -143,28 +150,28 @@ ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [total_work, total_work2](const std::vector kernels) { + result.invoker_factory = [data_type, + b_c = blens[1], + a_cstride, + b_cstride, + c_cstride, + total_work, + total_work2](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& blens = params.bTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - kernel(params.ATensor, - static_cast(astrides[1]), + static_cast(a_cstride), params.BTensor, - static_cast(bstrides[1]), + static_cast(b_cstride), params.CTensor, - static_cast(cstrides[1]), + static_cast(c_cstride), miopen_alpha0, miopen_alpha1, 
miopen_beta, @@ -174,7 +181,7 @@ ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext static_cast(total_work), static_cast(total_work2), static_cast(!float_equal(miopen_beta, 0.0)), - static_cast(blens[1] == 1)); + static_cast(b_c == 1)); }); }; }; diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index b5bee28bd5..0f400eab2d 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -92,6 +92,10 @@ Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); + const size_t b_nstride = bTensorDesc.GetStrides()[1]; + + miopenDataType_t data_type = bTensorDesc.GetType(); + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int max_num_wg = 4096; @@ -100,7 +104,7 @@ Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, size_t local_threads = 256; // for naive tensor ops - auto&& [RD_BLCK, READ_TYPE] = GetRDBLCKandREADTYPE(clens[2], bTensorDesc.GetType()); + auto&& [RD_BLCK, READ_TYPE] = GetRDBLCKandREADTYPE(clens[2], data_type); size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); size_t grp_sz = (total_work + local_threads - 1) / local_threads; @@ -130,23 +134,21 @@ Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [total_work](const std::vector kernels) { + result.invoker_factory = [data_type, b_c = blens[1], b_nstride, total_work]( + const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& blens = params.bTensorDesc.GetLengths(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - kernel(params.ATensor, params.BTensor, - static_cast(blens[1]), - static_cast(bstrides[1]), + static_cast(b_c), + static_cast(b_nstride), params.CTensor, miopen_alpha0, miopen_alpha1, diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index 049fb6860f..f4cdb191cf 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -70,12 +70,22 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; + const auto& aTensorDesc = problem.GetATensorDesc(); const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1], astrides[2]) = miopen::tien<3>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1], bstrides[2]) = miopen::tien<3>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1], cstrides[2]) = miopen::tien<3>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + 
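// [Editorial sketch] GetBitmapAndWgInfo, called just below, condenses the B/C
// lengths into a bitmap of the dimensions along which B is not broadcast, plus
// workgroup sizing. The helper is internal to MIOpen; the following is only a
// hedged approximation of the bitmap part, assuming bit i is counted from the
// innermost dimension:
//
//     #include <cstddef>
//     #include <vector>
//
//     inline unsigned BroadcastBitmap(const std::vector<std::size_t>& blens,
//                                     const std::vector<std::size_t>& clens)
//     {
//         unsigned bitmap = 0;
//         for(std::size_t i = 0; i < clens.size(); ++i)
//             if(blens[i] == clens[i]) // B varies along this dim (not broadcast)
//                 bitmap |= 1u << (clens.size() - 1 - i);
//         return bitmap;
//     }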
auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int num_wg_orig = num_wg; @@ -106,23 +116,24 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [bitmap, work_per_wg, num_wg_orig](const std::vector kernels) { + result.invoker_factory = [data_type, + blens, + clens, + astrides, + bstrides, + cstrides, + bitmap, + work_per_wg, + num_wg_orig](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& blens = params.bTensorDesc.GetLengths(); - const auto& clens = params.cTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - kernel(params.ATensor, static_cast(astrides[0]), static_cast(astrides[1]), diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index 92f179f772..93b9b9847b 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -70,6 +70,27 @@ Op4dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + std::array blens; + std::array clens; + std::tie(blens[0], blens[1], blens[2], blens[3]) = miopen::tien<4>(bTensorDesc.GetLengths()); + std::tie(clens[0], clens[1], clens[2], clens[3]) = miopen::tien<4>(cTensorDesc.GetLengths()); + + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1], astrides[2], astrides[3]) = + miopen::tien<4>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1], bstrides[2], bstrides[3]) = + miopen::tien<4>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1], cstrides[2], cstrides[3]) = + miopen::tien<4>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + int max_num_wg = 4096; auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = @@ -95,23 +116,24 @@ Op4dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [work_per_wg, num_wg_orig, bitmap](const std::vector kernels) { + result.invoker_factory = [data_type, + blens, + clens, + astrides, + bstrides, + cstrides, + work_per_wg, + num_wg_orig, + bitmap](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = 
as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& blens = params.bTensorDesc.GetLengths(); - const auto& clens = params.cTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - kernel(params.ATensor, static_cast(astrides[0]), // a_nstride, static_cast(astrides[1]), // a_cstride, diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp index 96ca761063..a53174507e 100644 --- a/src/solver/tensorOp/Op4dTensorLite.cpp +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -97,6 +97,8 @@ ConvSolution Op4dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); + miopenDataType_t data_type = bTensorDesc.GetType(); + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = Get4dParams(problem, true); @@ -127,12 +129,12 @@ ConvSolution Op4dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [total_work](const std::vector kernels) { + result.invoker_factory = [data_type, total_work](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp index 63a7f5ddbc..35ef705f5b 100644 --- a/src/solver/tensorOp/Op5dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp @@ -66,17 +66,30 @@ std::size_t Op5dTensorGeneric::GetWorkspaceSize( } ConvSolution -Op5dTensorGeneric::GetSolution(const ExecutionContext& context, +Op5dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, const miopen::tensorOp::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; - const auto& cTensorDesc = problem.GetCTensorDesc(); + const auto& aTensorDesc = problem.GetATensorDesc(); const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); const auto& blens = bTensorDesc.GetLengths(); const auto& clens = cTensorDesc.GetLengths(); + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1], astrides[2], astrides[3], astrides[4]) = + miopen::tien<5>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1], bstrides[2], bstrides[3], bstrides[4]) = + miopen::tien<5>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1], cstrides[2], cstrides[3], cstrides[4]) = + miopen::tien<5>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); int num_wg_orig = num_wg; @@ -107,58 +120,53 @@ Op5dTensorGeneric::GetSolution(const ExecutionContext& context, 
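// [Editorial sketch] As in the 1-D/2-D/3-D solvers earlier in this patch, a
// length-1 B dimension is broadcast by passing the kernel a zero stride, so
// indexing re-reads the same element along that axis. Stand-alone form of the
// normalization behind those "blens[i] == 1 ? 0 : bstrides[i]" arguments:
//
//     #include <array>
//     #include <cstddef>
//
//     template <std::size_t N>
//     std::array<std::size_t, N>
//     BroadcastStrides(const std::array<std::size_t, N>& lens,
//                      const std::array<std::size_t, N>& strides)
//     {
//         std::array<std::size_t, N> out{};
//         for(std::size_t i = 0; i < N; ++i)
//             out[i] = (lens[i] == 1) ? 0 : strides[i]; // stride 0 repeats the element
//         return out;
//     }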
kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [bitmap, work_per_wg, num_wg_orig](const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - const auto& blens = params.bTensorDesc.GetLengths(); - const auto& clens = params.cTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - - kernel(params.ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - static_cast(astrides[3]), - params.BTensor, - static_cast(blens[1]), // b_c, - static_cast(blens[2]), // b_d, - static_cast(blens[3]), // b_h, - static_cast(blens[4]), // b_w, - static_cast(bstrides[0]), // b_nstride, - static_cast(bstrides[1]), // b_cstride, - static_cast(bstrides[2]), // b_dstride, - static_cast(bstrides[3]), // b_hstride, - params.CTensor, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_d, - static_cast(clens[3]), // c_h, - static_cast(clens[4]), // c_w, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_dstride, - static_cast(cstrides[3]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - bitmap, - work_per_wg, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig)); - }); + result.invoker_factory = + [data_type, blens, clens, astrides, bstrides, cstrides, bitmap, work_per_wg, num_wg_orig]( + const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(astrides[2]), + static_cast(astrides[3]), + params.BTensor, + static_cast(blens[1]), // b_c, + static_cast(blens[2]), // b_d, + static_cast(blens[3]), // b_h, + static_cast(blens[4]), // b_w, + static_cast(bstrides[0]), // b_nstride, + static_cast(bstrides[1]), // b_cstride, + static_cast(bstrides[2]), // b_dstride, + static_cast(bstrides[3]), // b_hstride, + params.CTensor, + static_cast(clens[1]), // c_c, + static_cast(clens[2]), // c_d, + static_cast(clens[3]), // c_h, + static_cast(clens[4]), // c_w, + static_cast(cstrides[0]), // c_nstride, + static_cast(cstrides[1]), // c_cstride, + static_cast(cstrides[2]), // c_dstride, + static_cast(cstrides[3]), // c_hstride, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + bitmap, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig)); + }); + }; }; - }; result.construction_params.push_back(kernel); return result; diff --git 
a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index 09a595582b..16e5d19d70 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -89,6 +89,23 @@ ConvSolution OpTensorFwdBias::GetSolution([[maybe_unused]] const ExecutionContex const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); + std::array blens; + std::array clens; + std::tie(blens[0], blens[1], blens[2], blens[3]) = miopen::tien<4>(bTensorDesc.GetLengths()); + std::tie(clens[0], clens[1], clens[2], clens[3]) = miopen::tien<4>(cTensorDesc.GetLengths()); + + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1], astrides[2], astrides[3]) = + miopen::tien<4>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1], bstrides[2], bstrides[3]) = + miopen::tien<4>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1], cstrides[2], cstrides[3]) = + miopen::tien<4>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + int max_num_wg = 4096; auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = @@ -130,23 +147,25 @@ ConvSolution OpTensorFwdBias::GetSolution([[maybe_unused]] const ExecutionContex kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); result.invoker_factory = - [work_per_wg, num_wg_orig, incr_wg, packed_tensor](const std::vector kernels) { + [data_type, + blens, + clens, + astrides, + bstrides, + cstrides, + work_per_wg, + num_wg_orig, + incr_wg, + packed_tensor](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& blens = params.bTensorDesc.GetLengths(); - const auto& clens = params.cTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - if(packed_tensor) { // OpTensorFwdBias kernel(params.ATensor, diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index 11d33005b7..9792b21093 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -107,6 +107,21 @@ OpTensorLeadingOnes::GetSolution([[maybe_unused]] const ExecutionContext& contex const auto& bTensorDesc = problem.GetBTensorDesc(); const auto& cTensorDesc = problem.GetCTensorDesc(); + std::array clens; + std::tie(clens[0], clens[1], clens[2], clens[3]) = miopen::tien<4>(cTensorDesc.GetLengths()); + + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1], astrides[2], astrides[3]) = + miopen::tien<4>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1], bstrides[2], bstrides[3]) = + miopen::tien<4>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1], cstrides[2], cstrides[3]) = + miopen::tien<4>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + int max_num_wg = 4096; auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, 
local_threads, global_threads] = @@ -147,22 +162,24 @@ OpTensorLeadingOnes::GetSolution([[maybe_unused]] const ExecutionContext& contex kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); result.invoker_factory = - [work_per_wg, num_wg_orig, bitmap, packed_tensor](const std::vector kernels) { + [data_type, + clens, + astrides, + bstrides, + cstrides, + work_per_wg, + num_wg_orig, + bitmap, + packed_tensor](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - visit_float(params.bTensorDesc.GetType(), [&](auto as_float) { + visit_float(data_type, [&](auto as_float) { auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); auto miopen_beta = as_float(*(static_cast(params.beta))); - const auto& clens = params.cTensorDesc.GetLengths(); - - const auto& astrides = params.aTensorDesc.GetStrides(); - const auto& bstrides = params.bTensorDesc.GetStrides(); - const auto& cstrides = params.cTensorDesc.GetStrides(); - if(packed_tensor) { // OpTensorLeadingOnes kernel(params.ATensor, diff --git a/src/tensor.cpp b/src/tensor.cpp index f65a1a408e..c1bd709267 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -906,20 +906,8 @@ void OpTensor2(Handle& handle, const auto problem = tensorOp::ProblemDescription{ tensorOp, beta, aTensorDesc, bTensorDesc, cTensorDesc, nonStandardSquash}; - const auto invoke_params = tensorOp::InvokeParams{tensorOp, - alpha0, - aTensorDesc, - ATensor, - alpha1, - bTensorDesc, - BTensor, - beta, - cTensorDesc, - CTensor, - Aoffset, - Boffset, - Coffset, - nonStandardSquash}; + const auto invoke_params = tensorOp::InvokeParams{ + alpha0, ATensor, alpha1, BTensor, beta, CTensor, Aoffset, Boffset, Coffset}; const auto algo = AlgorithmName{"TensorOpSolver"}; const auto solvers = solver::SolverContainer{} + diff --git a/src/tensorOp/problem_description.cpp b/src/tensorOp/problem_description.cpp index 4056fd3172..6053e7f1a0 100644 --- a/src/tensorOp/problem_description.cpp +++ b/src/tensorOp/problem_description.cpp @@ -34,28 +34,31 @@ namespace tensorOp { NetworkConfig ProblemDescription::MakeNetworkConfig() const { - std::ostringstream ss; + std::string ss; - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); - auto astrides = aTensorDesc.GetStrides(); - auto bstrides = bTensorDesc.GetStrides(); - auto cstrides = cTensorDesc.GetStrides(); + const auto& astrides = aTensorDesc.GetStrides(); + const auto& bstrides = bTensorDesc.GetStrides(); + const auto& cstrides = cTensorDesc.GetStrides(); - auto printDims = [&ss](const auto& dim) { - for(uint32_t i = 0; i < dim.size(); i++) + auto printDims = [&ss, dims = alens.size() - 1](const auto& dim) { + for(uint32_t i = 0; i < dims; i++) { - ss << dim[i]; - if(i != (dim.size() - 1)) - { - ss << "x"; - } + ss.append(std::to_string(dim[i])); + ss += 'x'; } - ss << "-"; + ss += std::to_string(dim.back()); + ss += '-'; }; - ss << std::to_string(aTensorDesc.GetType()) << "-" << std::to_string(tensorOp) << "-"; + ss.reserve(1024); + ss.append(std::string_view("TensorOp-")); + ss += std::to_string(aTensorDesc.GetType()); + ss += '-'; + ss += std::to_string(tensorOp); + ss += '-'; printDims(alens); printDims(blens); @@ -63,9 +66,9 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const 
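// [Editorial note] The std::string-based key builder in this hunk drops the
// previous std::ostringstream and its locale/formatting layer; with the
// reserve(1024) a typical key costs a single allocation. In outline:
//
//     std::string s;
//     s.reserve(1024);               // one allocation for typical keys
//     s += "TensorOp-";
//     s += std::to_string(type);     // plain appends, no stream state
//     s += '-';
//     // ... printDims-style appends for lengths and strides ...
//     return NetworkConfig(std::move(s)); // new rvalue ctor: buffer handed over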
printDims(bstrides); printDims(cstrides); - ss << (float_equal(beta, 0.0f) ? "1" : "0"); + ss += (float_equal(beta, 0.0f) ? '1' : '0'); - return NetworkConfig{ss.str()}; + return NetworkConfig(std::move(ss)); } } // namespace tensorOp From 496b414712ff764b8abfc73896b7b1e1b3835848 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Fri, 15 Nov 2024 19:38:35 +0200 Subject: [PATCH 15/25] clang format --- src/solver/tensorOp/Op1dTensorGeneric.cpp | 96 +++++++------- src/solver/tensorOp/Op2dTensorLite.cpp | 64 +++++----- src/solver/tensorOp/Op2dTensorSquash.cpp | 56 ++++---- src/solver/tensorOp/Op3dTensorGeneric.cpp | 78 ++++++----- src/solver/tensorOp/Op4dTensorGeneric.cpp | 88 ++++++------- src/solver/tensorOp/OpTensorFwdBias.cpp | 131 ++++++++++--------- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 135 ++++++++++---------- 7 files changed, 315 insertions(+), 333 deletions(-) diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index 7a2662e60b..c15b02ced7 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -112,55 +112,55 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result - .invoker_factory = [data_type, fit_into_int, b_n, c_n, a_nstrides, b_nstrides, c_nstrides]( - const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(data_type, [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - if(fit_into_int) - { - kernel(params.ATensor, - params.BTensor, - params.CTensor, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(a_nstrides), - static_cast(b_n == 1 ? 0 : b_nstrides), - static_cast(c_nstrides), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(c_n), - !float_equal(miopen_beta, 0.0)); - } - else - { - kernel(params.ATensor, - params.BTensor, - params.CTensor, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(a_nstrides), - static_cast(b_n == 1 ? 0 : b_nstrides), - static_cast(c_nstrides), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(c_n), - !float_equal(miopen_beta, 0.0)); - } - }); + result.invoker_factory = + [data_type, fit_into_int, b_n, c_n, a_nstrides, b_nstrides, c_nstrides]( + const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + if(fit_into_int) + { + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(a_nstrides), + static_cast(b_n == 1 ? 
0 : b_nstrides), + static_cast(c_nstrides), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(c_n), + !float_equal(miopen_beta, 0.0)); + } + else + { + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(a_nstrides), + static_cast(b_n == 1 ? 0 : b_nstrides), + static_cast(c_nstrides), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(c_n), + !float_equal(miopen_beta, 0.0)); + } + }); + }; }; - }; result.construction_params.push_back(kernel); return result; diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index 696d73b073..2b7b030a2f 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -150,41 +150,37 @@ ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [data_type, - b_c = blens[1], - a_cstride, - b_cstride, - c_cstride, - total_work, - total_work2](const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(data_type, [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - kernel(params.ATensor, - static_cast(a_cstride), - params.BTensor, - static_cast(b_cstride), - params.CTensor, - static_cast(c_cstride), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(total_work), - static_cast(total_work2), - static_cast(!float_equal(miopen_beta, 0.0)), - static_cast(b_c == 1)); - }); + result.invoker_factory = + [data_type, b_c = blens[1], a_cstride, b_cstride, c_cstride, total_work, total_work2]( + const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + static_cast(a_cstride), + params.BTensor, + static_cast(b_cstride), + params.CTensor, + static_cast(c_cstride), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(total_work), + static_cast(total_work2), + static_cast(!float_equal(miopen_beta, 0.0)), + static_cast(b_c == 1)); + }); + }; }; - }; result.construction_params.push_back(kernel); return result; diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index 0f400eab2d..d6ca7cfa3b 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -134,35 +134,35 @@ Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = 
[data_type, b_c = blens[1], b_nstride, total_work]( - const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(data_type, [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - kernel(params.ATensor, - params.BTensor, - static_cast(b_c), - static_cast(b_nstride), - params.CTensor, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(total_work), - static_cast(!float_equal(miopen_alpha0, 0.0)), - static_cast(!float_equal(miopen_alpha1, 0.0)), - static_cast(!float_equal(miopen_beta, 0.0))); - }); + result.invoker_factory = + [data_type, b_c = blens[1], b_nstride, total_work](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + params.BTensor, + static_cast(b_c), + static_cast(b_nstride), + params.CTensor, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(total_work), + static_cast(!float_equal(miopen_alpha0, 0.0)), + static_cast(!float_equal(miopen_alpha1, 0.0)), + static_cast(!float_equal(miopen_beta, 0.0))); + }); + }; }; - }; result.construction_params.push_back(kernel); return result; diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index f4cdb191cf..c03aec3f33 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -116,49 +116,43 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [data_type, - blens, - clens, - astrides, - bstrides, - cstrides, - bitmap, - work_per_wg, - num_wg_orig](const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(data_type, [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - kernel(params.ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - params.BTensor, - static_cast(blens[1]), - static_cast(blens[2]), - static_cast(bstrides[0]), - static_cast(bstrides[1]), - params.CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - bitmap, - work_per_wg, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig)); - }); + result.invoker_factory = + [data_type, 
blens, clens, astrides, bstrides, cstrides, bitmap, work_per_wg, num_wg_orig]( + const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + params.BTensor, + static_cast(blens[1]), + static_cast(blens[2]), + static_cast(bstrides[0]), + static_cast(bstrides[1]), + params.CTensor, + static_cast(clens[1]), + static_cast(clens[2]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + bitmap, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig)); + }); + }; }; - }; result.construction_params.push_back(kernel); return result; diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index 93b9b9847b..3c67a3411f 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -116,54 +116,48 @@ Op4dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = [data_type, - blens, - clens, - astrides, - bstrides, - cstrides, - work_per_wg, - num_wg_orig, - bitmap](const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(data_type, [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - kernel(params.ATensor, - static_cast(astrides[0]), // a_nstride, - static_cast(astrides[1]), // a_cstride, - static_cast(astrides[2]), // a_hstride, - params.BTensor, - static_cast(blens[1]), // b_c, - static_cast(blens[2]), // b_h, - static_cast(blens[3]), // b_w, - static_cast(bstrides[0]), // b_nstride, - static_cast(bstrides[1]), // b_cstride, - static_cast(bstrides[2]), // b_hstride, - params.CTensor, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_h, - static_cast(clens[3]), // c_w, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - bitmap, - work_per_wg, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig)); - }); + result.invoker_factory = + [data_type, blens, clens, astrides, bstrides, cstrides, work_per_wg, num_wg_orig, bitmap]( + const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = 
as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + static_cast(astrides[0]), // a_nstride, + static_cast(astrides[1]), // a_cstride, + static_cast(astrides[2]), // a_hstride, + params.BTensor, + static_cast(blens[1]), // b_c, + static_cast(blens[2]), // b_h, + static_cast(blens[3]), // b_w, + static_cast(bstrides[0]), // b_nstride, + static_cast(bstrides[1]), // b_cstride, + static_cast(bstrides[2]), // b_hstride, + params.CTensor, + static_cast(clens[1]), // c_c, + static_cast(clens[2]), // c_h, + static_cast(clens[3]), // c_w, + static_cast(cstrides[0]), // c_nstride, + static_cast(cstrides[1]), // c_cstride, + static_cast(cstrides[2]), // c_hstride, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + bitmap, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig)); + }); + }; }; - }; result.construction_params.push_back(kernel); return result; diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index 16e5d19d70..9df036df8c 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -146,73 +146,72 @@ ConvSolution OpTensorFwdBias::GetSolution([[maybe_unused]] const ExecutionContex kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = - [data_type, - blens, - clens, - astrides, - bstrides, - cstrides, - work_per_wg, - num_wg_orig, - incr_wg, - packed_tensor](const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(data_type, [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - if(packed_tensor) - { // OpTensorFwdBias - kernel(params.ATensor, - params.BTensor, - static_cast(blens[1]), - params.CTensor, - static_cast(clens[0]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - work_per_wg, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig), - static_cast(incr_wg)); - } - else - { // OpTensorFwdBiasGeneric - kernel(params.ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - params.BTensor, - static_cast(blens[1]), - static_cast(bstrides[1]), - params.CTensor, - static_cast(clens[0]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - static_cast(cstrides[2]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - work_per_wg, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig), - static_cast(incr_wg)); - } - }); - }; + result.invoker_factory = [data_type, + blens, + clens, + astrides, + bstrides, + cstrides, + work_per_wg, + num_wg_orig, + incr_wg, + packed_tensor](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto 
miopen_beta = as_float(*(static_cast(params.beta))); + + if(packed_tensor) + { // OpTensorFwdBias + kernel(params.ATensor, + params.BTensor, + static_cast(blens[1]), + params.CTensor, + static_cast(clens[0]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + work_per_wg, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + static_cast(incr_wg)); + } + else + { // OpTensorFwdBiasGeneric + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(astrides[2]), + params.BTensor, + static_cast(blens[1]), + static_cast(bstrides[1]), + params.CTensor, + static_cast(clens[0]), + static_cast(clens[3]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + static_cast(cstrides[2]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + static_cast(incr_wg)); + } + }); }; + }; result.construction_params.push_back(kernel); return result; diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index 9792b21093..d930da0da6 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -161,75 +161,74 @@ OpTensorLeadingOnes::GetSolution([[maybe_unused]] const ExecutionContext& contex kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); - result.invoker_factory = - [data_type, - clens, - astrides, - bstrides, - cstrides, - work_per_wg, - num_wg_orig, - bitmap, - packed_tensor](const std::vector kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(data_type, [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); - auto miopen_beta = as_float(*(static_cast(params.beta))); - - if(packed_tensor) - { // OpTensorLeadingOnes - kernel(params.ATensor, - params.BTensor, - params.CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - work_per_wg, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig), - bitmap); - } - else - { // OpTensorLeadingOnesGeneric - kernel(params.ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - params.BTensor, - static_cast(bstrides[0]), - static_cast(bstrides[1]), - static_cast(bstrides[2]), - params.CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - static_cast(cstrides[2]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - work_per_wg, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig), - bitmap); - } - }); - }; + result.invoker_factory = [data_type, + clens, + astrides, + bstrides, + cstrides, + work_per_wg, + num_wg_orig, + bitmap, + packed_tensor](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = 
handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + if(packed_tensor) + { // OpTensorLeadingOnes + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(clens[1]), + static_cast(clens[2]), + static_cast(clens[3]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + work_per_wg, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + bitmap); + } + else + { // OpTensorLeadingOnesGeneric + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(astrides[2]), + params.BTensor, + static_cast(bstrides[0]), + static_cast(bstrides[1]), + static_cast(bstrides[2]), + params.CTensor, + static_cast(clens[1]), + static_cast(clens[2]), + static_cast(clens[3]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + static_cast(cstrides[2]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + bitmap); + } + }); }; + }; result.construction_params.push_back(kernel); return result; From cb6fd6e956ca32bbd14331bff72b95a074dab626 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Mon, 18 Nov 2024 10:18:27 +0200 Subject: [PATCH 16/25] change for new Op3dTensorGeneric kernel usage --- src/solver/tensorOp/Op1dTensorGeneric.cpp | 6 ++-- src/solver/tensorOp/Op2dTensorGeneric.cpp | 2 +- src/solver/tensorOp/Op3dTensorGeneric.cpp | 42 +++++++++++------------ 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index c15b02ced7..896d75d50c 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -129,9 +129,9 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel(params.ATensor, params.BTensor, params.CTensor, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), static_cast(a_nstrides), static_cast(b_n == 1 ? 
0 : b_nstrides), static_cast(c_nstrides), diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index 917eee17c2..41fca78068 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -101,7 +101,7 @@ Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true); build_params.Define("USE_2D_TENSOR_GENERIC"); diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index c03aec3f33..c2c7212646 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -103,12 +103,11 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, GetCommonParams(build_params, problem, false); build_params.Define("USE_3D_TENSOR_GENERIC"); - build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); auto kernel = KernelInfo{}; - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; kernel.kernel_name = "Op3dTensorGeneric"; using std::begin, std::end; @@ -117,8 +116,7 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); result.invoker_factory = - [data_type, blens, clens, astrides, bstrides, cstrides, bitmap, work_per_wg, num_wg_orig]( - const std::vector kernels) { + [data_type, blens, clens, astrides, bstrides, cstrides](const std::vector kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); @@ -129,27 +127,29 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, auto miopen_beta = as_float(*(static_cast(params.beta))); kernel(params.ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), params.BTensor, - static_cast(blens[1]), - static_cast(blens[2]), - static_cast(bstrides[0]), - static_cast(bstrides[1]), params.CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(blens[1] == 1 ? clens[1] : blens[1]), // b_c, + static_cast(blens[2] == 1 ? clens[2] : blens[2]), // b_h, + static_cast(clens[1]), // c_c, + static_cast(clens[2]), // c_h, + static_cast(astrides[0]), // a_nstride, + static_cast(astrides[1]), // a_cstride, + static_cast(astrides[2]), // a_hstride, + static_cast(blens[0] == 1 ? 0 : bstrides[0]), // b_nstride, + static_cast(blens[1] == 1 ? 0 : bstrides[1]), // b_cstride, + static_cast(blens[2] == 1 ? 
0 : bstrides[2]), // b_hstride, + static_cast(cstrides[0]), // c_nstride, + static_cast(cstrides[1]), // c_cstride, + static_cast(cstrides[2]), // c_hstride, miopen_alpha0, miopen_alpha1, miopen_beta, - bitmap, - work_per_wg, - static_cast(params.Aoffset), - static_cast(params.Boffset), - static_cast(params.Coffset), - static_cast(num_wg_orig)); + static_cast(clens[0]), + !float_equal(miopen_beta, 0.0)); }); }; }; From 6c3d0c2d7e87bb98e5023b1d78ab686ad2eebaf4 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Mon, 18 Nov 2024 15:34:53 +0200 Subject: [PATCH 17/25] remove unused variable --- src/solver/tensorOp/Op3dTensorGeneric.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index c2c7212646..3bef77fa29 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -88,7 +88,6 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); - int num_wg_orig = num_wg; int max_num_wg = 4096; num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; From bd0bd61c78ecf7a351f20486c9ec4f217f85e7a0 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Mon, 18 Nov 2024 15:36:19 +0200 Subject: [PATCH 18/25] clang format --- src/solver/tensorOp/Op3dTensorGeneric.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index 3bef77fa29..2bafc6abaa 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -88,8 +88,8 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); - int max_num_wg = 4096; - num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; + int max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; size_t local_threads = 256; size_t global_threads = num_wg * local_threads; From 3f14d3aa03fefc471a7498ba1992757bd08cc0d2 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Tue, 19 Nov 2024 10:00:35 +0200 Subject: [PATCH 19/25] support for half data type for CL kernels --- src/kernels/MIOpenTensorKernels.cl | 17 +++++++ src/solver/tensorOp/Op1dTensorGeneric.cpp | 2 +- src/solver/tensorOp/Op2dTensorGeneric.cpp | 2 +- src/solver/tensorOp/Op2dTensorLite.cpp | 2 +- src/solver/tensorOp/Op2dTensorSquash.cpp | 2 +- src/solver/tensorOp/Op3dTensorGeneric.cpp | 2 +- src/solver/tensorOp/Op4dTensorGeneric.cpp | 2 +- src/solver/tensorOp/Op4dTensorLite.cpp | 2 +- src/solver/tensorOp/Op5dTensorGeneric.cpp | 2 +- src/solver/tensorOp/OpTensorFwdBias.cpp | 2 +- src/solver/tensorOp/OpTensorLeadingOnes.cpp | 2 +- src/solver/tensorOp/tensor_op_helpers.hpp | 50 ++++++++++++++++++++- 12 files changed, 76 insertions(+), 11 deletions(-) diff --git a/src/kernels/MIOpenTensorKernels.cl b/src/kernels/MIOpenTensorKernels.cl index 842d3d4d6b..fce43b78f3 100644 --- a/src/kernels/MIOpenTensorKernels.cl +++ b/src/kernels/MIOpenTensorKernels.cl @@ -23,6 +23,23 @@ * SOFTWARE. 
* *******************************************************************************/ +#if MIOPEN_USE_FP16 == 1 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#define _FLOAT half +#ifndef HALF_MAX +#define MAX_VAL 65504 /* max value */ +#else +#define MAX_VAL HALF_MAX +#endif +#endif +#if MIOPEN_USE_FP32 == 1 +#define _FLOAT float +#ifndef FLT_MAX +#define MAX_VAL 3.402823466e+38F /* max value */ +#else +#define MAX_VAL FLT_MAX +#endif +#endif /* Only works for NCHW * bitmap tracks which dims are the same between 'a' and 'c'. diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index 896d75d50c..9abf49b912 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -97,7 +97,7 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true); + GetCommonParams(build_params, problem, false, true); build_params.Define("USE_1D_TENSOR_GENERIC"); diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index 41fca78068..03045d69ae 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -101,7 +101,7 @@ Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true); + GetCommonParams(build_params, problem, false, true); build_params.Define("USE_2D_TENSOR_GENERIC"); diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index 2b7b030a2f..e070354bfe 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -133,7 +133,7 @@ ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true, false); build_params.Define("USE_2D_TENSOR_LITE"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index d6ca7cfa3b..40876e9d3c 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -117,7 +117,7 @@ Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true, false); build_params.Define("USE_2D_TENSOR_SQUASH"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index 2bafc6abaa..c8a5f9dd1a 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -99,7 +99,7 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, false, false); build_params.Define("USE_3D_TENSOR_GENERIC"); diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index 3c67a3411f..c28a094741 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ 
b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -101,7 +101,7 @@ Op4dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true, false); build_params.Define("USE_4D_TENSOR_GENERIC"); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp index a53174507e..fa9d45f0e5 100644 --- a/src/solver/tensorOp/Op4dTensorLite.cpp +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -112,7 +112,7 @@ ConvSolution Op4dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true, false); build_params.Define("USE_4D_TENSOR_LITE"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp index 35ef705f5b..bcf6d66773 100644 --- a/src/solver/tensorOp/Op5dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp @@ -104,7 +104,7 @@ Op5dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true, false); build_params.Define("USE_5D_TENSOR_GENERIC"); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index 9df036df8c..6d8d8139f1 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -121,7 +121,7 @@ ConvSolution OpTensorFwdBias::GetSolution([[maybe_unused]] const ExecutionContex KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true, false); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index d930da0da6..37daada086 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -137,7 +137,7 @@ OpTensorLeadingOnes::GetSolution([[maybe_unused]] const ExecutionContext& contex KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false); + GetCommonParams(build_params, problem, true, false); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); auto kernel = KernelInfo{}; diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp index 46ce39e4a0..d902914368 100644 --- a/src/solver/tensorOp/tensor_op_helpers.hpp +++ b/src/solver/tensorOp/tensor_op_helpers.hpp @@ -39,9 +39,57 @@ namespace tensorOp { inline void GetCommonParams(KernelBuildParameters& build_params, const miopen::tensorOp::ProblemDescription& problem, + bool isCLKernel, bool is64bSupported) { - build_params.Define("MIOPEN_TYPE", miopen::GetDataType(problem.GetBTensorDesc().GetType())); + miopenDataType_t data_type = problem.GetBTensorDesc().GetType(); + + if(isCLKernel) + { // values for MIOPEN_USE_ macros + int use_fp16 = 0; + int use_fp16x4 = 0; + int use_fp16x8 = 0; + int use_fp32 = 0; + int use_int8 = 0; + int use_int32 = 0; + int use_bfp16 = 
0;
+ int use_fp64 = 0;
+ int use_fp8 = 0;
+ int use_bfp8 = 0;
+ const int use_rne_bfloat16 = MIOPEN_USE_RNE_BFLOAT16;
+
+ switch(data_type)
+ {
+ case miopenHalf: use_fp16 = 1; break;
+ case miopenFloat: use_fp32 = 1; break;
+ case miopenInt8: use_int8 = 1; break;
+ case miopenBFloat16: use_bfp16 = 1; break;
+ case miopenInt32: use_int32 = 1; break;
+ case miopenDouble: use_fp64 = 1; break;
+ case miopenFloat8: use_fp8 = 1; break;
+ case miopenBFloat8: use_bfp8 = 1; break;
+ default: MIOPEN_THROW("Unsupported data type."); break;
+ }
+
+ build_params.Define("MIOPEN_USE_FP16", use_fp16);
+ build_params.Define("MIOPEN_USE_FP16x4", use_fp16x4);
+ build_params.Define("MIOPEN_USE_FP16x8", use_fp16x8);
+ build_params.Define("MIOPEN_USE_FP32", use_fp32);
+ build_params.Define("MIOPEN_USE_INT8", use_int8);
+ build_params.Define("MIOPEN_USE_BFP16", use_bfp16);
+ build_params.Define("MIOPEN_USE_INT32", use_int32);
+ build_params.Define("MIOPEN_USE_RNE_BFLOAT16", use_rne_bfloat16);
+ build_params.Define("MIOPEN_FP8_IEEE_EXPONENT_BIAS", MIOPEN_FP8_IEEE_EXPONENT_BIAS);
+ build_params.Define("MIOPEN_FP8_CLIPPING", MIOPEN_FP8_CLIPPING);
+ if(use_fp64 != 0)
+ build_params.Define("MIOPEN_USE_FP64", use_fp64);
+ if(use_fp8 != 0)
+ build_params.Define("MIOPEN_USE_FP8", use_fp8);
+ if(use_bfp8 != 0)
+ build_params.Define("MIOPEN_USE_BFP8", use_bfp8);
+ }
+
+ build_params.Define("MIOPEN_TYPE", miopen::GetDataType(data_type));

 switch(problem.GetTensorOp())
 {

From 042129eb3270929d2d0c402119510c47081918ae Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Tue, 19 Nov 2024 15:12:24 +0200
Subject: [PATCH 20/25] additional changes to support the half data type

---
 src/kernels/MIOpenTensorKernels.cl          | 16 -------
 src/solver/tensorOp/Op1dTensorGeneric.cpp   |  2 +-
 src/solver/tensorOp/Op2dTensorGeneric.cpp   |  2 +-
 src/solver/tensorOp/Op2dTensorLite.cpp      |  2 +-
 src/solver/tensorOp/Op2dTensorSquash.cpp    |  2 +-
 src/solver/tensorOp/Op3dTensorGeneric.cpp   |  2 +-
 src/solver/tensorOp/Op4dTensorGeneric.cpp   |  2 +-
 src/solver/tensorOp/Op4dTensorLite.cpp      |  2 +-
 src/solver/tensorOp/Op5dTensorGeneric.cpp   |  2 +-
 src/solver/tensorOp/OpTensorFwdBias.cpp     |  2 +-
 src/solver/tensorOp/OpTensorLeadingOnes.cpp |  2 +-
 src/solver/tensorOp/tensor_op_helpers.hpp   | 46 ---------------------
 12 files changed, 10 insertions(+), 72 deletions(-)

diff --git a/src/kernels/MIOpenTensorKernels.cl b/src/kernels/MIOpenTensorKernels.cl
index fce43b78f3..8203dad1f0 100644
--- a/src/kernels/MIOpenTensorKernels.cl
+++ b/src/kernels/MIOpenTensorKernels.cl
@@ -23,23 +23,7 @@
  * SOFTWARE.
  *
  *******************************************************************************/
-#if MIOPEN_USE_FP16 == 1
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#define _FLOAT half
-#ifndef HALF_MAX
-#define MAX_VAL 65504 /* max value */
-#else
-#define MAX_VAL HALF_MAX
-#endif
-#endif
-#if MIOPEN_USE_FP32 == 1
-#define _FLOAT float
-#ifndef FLT_MAX
-#define MAX_VAL 3.402823466e+38F /* max value */
-#else
-#define MAX_VAL FLT_MAX
-#endif
-#endif
 
 /* Only works for NCHW
  * bitmap tracks which dims are the same between 'a' and 'c'.
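
Note on the hunk above (an illustrative sketch, not part of the patch): patch 20 deletes the per-type macro block from the kernel source and leaves the cl_khr_fp16 pragma enabled unconditionally, so type-selection macros of this kind reach MIOpenTensorKernels.cl only as host-side -D build defines (compare the " -DMIOPEN_TYPE=" strings and the GetDataTypeKernelParams call in the tensorocl.cpp code deleted below). A minimal OpenCL C sketch of how a kernel preamble can consume such defines; _FLOAT, MAX_VAL and the MIOPEN_USE_* names mirror the removed block, while ScaleTensorSketch is a hypothetical kernel, not MIOpen code:

/* Sketch only: element type chosen by host-side defines such as
 * -DMIOPEN_USE_FP16=1 or -DMIOPEN_USE_FP32=1 (assumed convention). */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#if MIOPEN_USE_FP16 == 1
#define _FLOAT half
#define MAX_VAL 65504 /* HALF_MAX */
#elif MIOPEN_USE_FP32 == 1
#define _FLOAT float
#define MAX_VAL 3.402823466e+38F /* FLT_MAX */
#else
#error "expected MIOPEN_USE_FP16 or MIOPEN_USE_FP32 to be defined"
#endif

__kernel void ScaleTensorSketch(__global const _FLOAT* a,
                                __global _FLOAT* c,
                                const _FLOAT alpha,
                                const uint n)
{
    /* Grid-stride loop: correct even when the launch has fewer
       work-items than elements. */
    for(uint i = get_global_id(0); i < n; i += get_global_size(0))
        c[i] = alpha * a[i];
}

With a preamble like this, selecting half versus float is purely a host-side compile decision, which is why the solvers above only need to pass the right defines through KernelBuildParameters.
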
diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp index 9abf49b912..896d75d50c 100644 --- a/src/solver/tensorOp/Op1dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -97,7 +97,7 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false, true); + GetCommonParams(build_params, problem, true); build_params.Define("USE_1D_TENSOR_GENERIC"); diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp index 03045d69ae..41fca78068 100644 --- a/src/solver/tensorOp/Op2dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -101,7 +101,7 @@ Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false, true); + GetCommonParams(build_params, problem, true); build_params.Define("USE_2D_TENSOR_GENERIC"); diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp index e070354bfe..2b7b030a2f 100644 --- a/src/solver/tensorOp/Op2dTensorLite.cpp +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -133,7 +133,7 @@ ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true, false); + GetCommonParams(build_params, problem, false); build_params.Define("USE_2D_TENSOR_LITE"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp index 40876e9d3c..d6ca7cfa3b 100644 --- a/src/solver/tensorOp/Op2dTensorSquash.cpp +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -117,7 +117,7 @@ Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true, false); + GetCommonParams(build_params, problem, false); build_params.Define("USE_2D_TENSOR_SQUASH"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp index c8a5f9dd1a..2bafc6abaa 100644 --- a/src/solver/tensorOp/Op3dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -99,7 +99,7 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, false, false); + GetCommonParams(build_params, problem, false); build_params.Define("USE_3D_TENSOR_GENERIC"); diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp index c28a094741..3c67a3411f 100644 --- a/src/solver/tensorOp/Op4dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -101,7 +101,7 @@ Op4dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true, false); + GetCommonParams(build_params, problem, false); build_params.Define("USE_4D_TENSOR_GENERIC"); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp index fa9d45f0e5..a53174507e 100644 --- 
a/src/solver/tensorOp/Op4dTensorLite.cpp +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -112,7 +112,7 @@ ConvSolution Op4dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true, false); + GetCommonParams(build_params, problem, false); build_params.Define("USE_4D_TENSOR_LITE"); build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp index bcf6d66773..35ef705f5b 100644 --- a/src/solver/tensorOp/Op5dTensorGeneric.cpp +++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp @@ -104,7 +104,7 @@ Op5dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true, false); + GetCommonParams(build_params, problem, false); build_params.Define("USE_5D_TENSOR_GENERIC"); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp index 6d8d8139f1..9df036df8c 100644 --- a/src/solver/tensorOp/OpTensorFwdBias.cpp +++ b/src/solver/tensorOp/OpTensorFwdBias.cpp @@ -121,7 +121,7 @@ ConvSolution OpTensorFwdBias::GetSolution([[maybe_unused]] const ExecutionContex KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true, false); + GetCommonParams(build_params, problem, false); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp index 37daada086..d930da0da6 100644 --- a/src/solver/tensorOp/OpTensorLeadingOnes.cpp +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -137,7 +137,7 @@ OpTensorLeadingOnes::GetSolution([[maybe_unused]] const ExecutionContext& contex KernelBuildParameters build_params = KernelBuildParameters{}; - GetCommonParams(build_params, problem, true, false); + GetCommonParams(build_params, problem, false); build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); auto kernel = KernelInfo{}; diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp index d902914368..26a9ac42d0 100644 --- a/src/solver/tensorOp/tensor_op_helpers.hpp +++ b/src/solver/tensorOp/tensor_op_helpers.hpp @@ -39,56 +39,10 @@ namespace tensorOp { inline void GetCommonParams(KernelBuildParameters& build_params, const miopen::tensorOp::ProblemDescription& problem, - bool isCLKernel, bool is64bSupported) { miopenDataType_t data_type = problem.GetBTensorDesc().GetType(); - if(isCLKernel) - { // values for MIOPEN_USE_ macros - int use_fp16 = 0; - int use_fp16x4 = 0; - int use_fp16x8 = 0; - int use_fp32 = 0; - int use_int8 = 0; - int use_int32 = 0; - int use_bfp16 = 0; - int use_fp64 = 0; - int use_fp8 = 0; - int use_bfp8 = 0; - const int use_rne_bfloat16 = MIOPEN_USE_RNE_BFLOAT16; - - switch(data_type) - { - case miopenHalf: use_fp16 = 1; break; - case miopenFloat: use_fp32 = 1; break; - case miopenInt8: use_int8 = 1; break; - case miopenBFloat16: use_bfp16 = 1; break; - case miopenInt32: use_int32 = 1; break; - case miopenDouble: use_fp64 = 1; break; - case miopenFloat8: use_fp8 = 1; break; - case miopenBFloat8: use_bfp8 = 1; break; - default: MIOPEN_THROW("Unsupported data type."); break; - } - - build_params.Define("MIOPEN_USE_FP16", use_fp16); - build_params.Define("MIOPEN_USE_FP16x4", use_fp16x4); 
- build_params.Define("MIOPEN_USE_FP16x8", use_fp16x8); - build_params.Define("MIOPEN_USE_FP32", use_fp32); - build_params.Define("MIOPEN_USE_INT8", use_int8); - build_params.Define("MIOPEN_USE_BFP16", use_bfp16); - build_params.Define("MIOPEN_USE_INT32", use_int32); - build_params.Define("MIOPEN_USE_RNE_BFLOAT16", use_rne_bfloat16); - build_params.Define("MIOPEN_FP8_IEEE_EXPONENT_BIAS", MIOPEN_FP8_IEEE_EXPONENT_BIAS); - build_params.Define("MIOPEN_FP8_CLIPPING", MIOPEN_FP8_CLIPPING); - if(use_fp64 != 0) - build_params.Define("MIOPEN_USE_FP64", use_fp64); - if(use_fp8 != 0) - build_params.Define("MIOPEN_USE_FP8", use_fp8); - if(use_bfp8 != 0) - build_params.Define("MIOPEN_USE_BFP8", use_bfp8); - } - build_params.Define("MIOPEN_TYPE", miopen::GetDataType(data_type)); switch(problem.GetTensorOp()) From 371d43c4781535ea71b75ee0f9981b8576f35754 Mon Sep 17 00:00:00 2001 From: novakovicdj Date: Wed, 20 Nov 2024 18:29:56 +0200 Subject: [PATCH 21/25] initial removal of tensorocl.cpp --- src/CMakeLists.txt | 1 - src/include/miopen/rnn/solvers.hpp | 23 +- src/include/miopen/tensor_ops.hpp | 18 +- src/ocl/tensorocl.cpp | 2617 ----------------- src/rnn/Solutions/Base/bw_data_modular.cpp | 4 +- src/rnn/Solutions/Base/bw_weights_modular.cpp | 4 +- src/rnn/Solutions/Base/fw_data_modular.cpp | 7 +- src/rnn/Solutions/bwd_multi_stream.cpp | 2 +- src/rnn/Solutions/bww_multi_stream.cpp | 2 +- src/rnn/Solutions/bww_s_steam.cpp | 2 +- src/tensor.cpp | 1178 +++++++- test/tensor_ops.cpp | 36 +- 12 files changed, 1202 insertions(+), 2692 deletions(-) delete mode 100644 src/ocl/tensorocl.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4f1096001a..0721efc4f3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -696,7 +696,6 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN ocl/lrn_ocl.cpp ocl/mloNorm.cpp ocl/pooling_ocl.cpp - ocl/tensorocl.cpp ocl/rnnocl.cpp ocl/utilocl.cpp ocl/ctcocl.cpp diff --git a/src/include/miopen/rnn/solvers.hpp b/src/include/miopen/rnn/solvers.hpp index 429bcee752..908c3def65 100644 --- a/src/include/miopen/rnn/solvers.hpp +++ b/src/include/miopen/rnn/solvers.hpp @@ -171,9 +171,9 @@ class RNNForwardDataModularAlgo : RNNModuleAlgoBase // base API void PrepareWriteBuffers(const Handle& handle, const runtimeArgsFwd& runtimeArgs) const; - void PropX(const Handle& handle, const runtimeArgsFwd& runtimeArgs) const; + void PropX(Handle& handle, const runtimeArgsFwd& runtimeArgs) const; - void AddBias(const Handle& handle, const runtimeArgsFwd& runtimeArgs) const; + void AddBias(Handle& handle, const runtimeArgsFwd& runtimeArgs) const; void PropHxCx(const Handle& handle, const runtimeArgsFwd& runtimeArgs, unsigned int layer, @@ -206,7 +206,7 @@ class RNNForwardDataModularAlgo : RNNModuleAlgoBase void PropY(const Handle& handle, const runtimeArgsFwd& runtimeArgs) const; // ext API - void PropX(const Handle& handle, + void PropX(Handle& handle, const runtimeArgsFwd& runtimeArgs, size_t gemm_batch_offset, size_t gemm_batch_size) const; @@ -340,7 +340,7 @@ class RNNBackwardDataModularAlgo : RNNModuleAlgoBase public: void PrepareWriteBuffers(const Handle& handle, Data_t dhx, Data_t dcx, Data_t workSpace) const; - void PropDhy(const Handle& handle, + void PropDhy(Handle& handle, ConstData_t dhy, Data_t workSpace, unsigned int layer, @@ -364,7 +364,7 @@ class RNNBackwardDataModularAlgo : RNNModuleAlgoBase const SequenceIterator& seq, SequenceDirection direction) const; - void PropDhxDcx(const Handle& handle, + void PropDhxDcx(Handle& handle, 
ConstData_t w, Data_t dhx, Data_t dcx, @@ -625,7 +625,7 @@ class RNNModularMultiStreamBWD struct runtimeArgsBwd { - const Handle* handle; + Handle* handle; ConstData_t dy; ConstData_t dhy; Data_t dhx; @@ -728,11 +728,8 @@ class RNNBackwardWeightsModularAlgo ConstData_t reserveSpace, size_t layer) const; - void BiasUpdate(const Handle& handle, - Data_t dw, - Data_t workSpace, - size_t layer, - size_t workSpaceSize) const; + void BiasUpdate( + Handle& handle, Data_t dw, Data_t workSpace, size_t layer, size_t workSpaceSize) const; void HiddenHStateWeights(const Handle& handle, Data_t dw, @@ -1027,7 +1024,7 @@ class RNNModularSingleStreamBWWeights // TODO static size_t GetWsSize() { return 0; }; - void Compute(const Handle& handle, + void Compute(Handle& handle, ConstData_t x, ConstData_t hx, Data_t dw, @@ -1076,7 +1073,7 @@ class RNNModularMultiStreamBWWeights ConstData_t reserveSpace; }; - void Compute(const Handle& handle, + void Compute(Handle& handle, ConstData_t x, ConstData_t hx, Data_t dw, diff --git a/src/include/miopen/tensor_ops.hpp b/src/include/miopen/tensor_ops.hpp index c19eb333f2..a344eb9dbc 100644 --- a/src/include/miopen/tensor_ops.hpp +++ b/src/include/miopen/tensor_ops.hpp @@ -173,7 +173,7 @@ MIOPEN_INTERNALS_EXPORT void SetTensor(const Handle& handle, const void* alpha, int offset = 0); -MIOPEN_INTERNALS_EXPORT void OpTensor(const Handle& handle, +MIOPEN_INTERNALS_EXPORT void OpTensor(Handle& handle, miopenTensorOp_t tensorOp, const void* alpha0, const TensorDescriptor& aTensorDesc, @@ -189,22 +189,6 @@ MIOPEN_INTERNALS_EXPORT void OpTensor(const Handle& handle, size_t Coffset = 0, bool nonStandardSquash = false); -MIOPEN_INTERNALS_EXPORT void OpTensor2(Handle& handle, - miopenTensorOp_t tensorOp, - const void* alpha0, - const TensorDescriptor& aTensorDesc, - ConstData_t ATensor, - const void* alpha1, - const TensorDescriptor& bTensorDesc, - ConstData_t BTensor, - const void* beta, - const TensorDescriptor& cTensorDesc, - Data_t CTensor, - size_t Aoffset = 0, - size_t Boffset = 0, - size_t Coffset = 0, - bool nonStandardSquash = false); - MIOPEN_INTERNALS_EXPORT void CopyTensor(const Handle& handle, const TensorDescriptor& srcDesc, ConstData_t src, diff --git a/src/ocl/tensorocl.cpp b/src/ocl/tensorocl.cpp deleted file mode 100644 index 985861c09f..0000000000 --- a/src/ocl/tensorocl.cpp +++ /dev/null @@ -1,2617 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2023 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MIO_TENSOROCL_DEBUG 0 - -namespace miopen { - -TensorDescriptor GetFlattenedTensorDescriptor(const TensorDescriptor& desc) -{ - // is packed - if(desc.IsPacked()) - return {desc.GetType(), {desc.GetElementSize()}, {static_cast(1)}}; - - // start flattening tensor - std::vector flat_lengths; - std::vector flat_strides; - - auto non1_length_strides = boost::combine(desc.GetLengths(), desc.GetStrides()) | - boost::adaptors::filtered(f_length_is_not_1_t()); - - auto i = non1_length_strides.begin(); - std::size_t flat_len = boost::get<0>(*i); - auto i_previous = i++; - - // the 0-th dimension full-length doesn't matter - for(; i != non1_length_strides.end(); ++i) - { - std::size_t len = boost::get<0>(*i); - std::size_t stride = boost::get<1>(*i); - std::size_t previous_stride = boost::get<1>(*i_previous); - std::size_t full_len = previous_stride / stride; - - if(len == full_len) - { - flat_len *= len; - } - else - { - flat_lengths.push_back(flat_len); - flat_strides.push_back(previous_stride); - flat_len = len; - } - i_previous = i; - } - flat_lengths.push_back(flat_len); - flat_strides.push_back(boost::get<1>(*i_previous)); - - return {desc.GetType(), flat_lengths, flat_strides}; -} - -// Free Tensor Functions -static void CreateBitmapAndGrid(unsigned int& bitmap, - const std::vector& a_lens, - const std::vector& c_lens, - int& num_wg, - int& work, - int d) -{ - for(int i = d; i >= 0; i--) - { - if(a_lens[i] != 1) - { - bitmap |= (1 << (a_lens.size() - (i + 1))); - num_wg *= a_lens[i]; - } - else - { - work *= c_lens[i]; - } - } -} - -static bool IsBitmapLeadingOnes(unsigned int bitmap, int n_size, int first_not_one) -{ - bool leading_ones = true; - - for(int i = first_not_one; i >= 0; i--) - { - bool is_one = (bitmap & (1 << (n_size - 1 - i))) != 0u; - leading_ones &= is_one; - } - return leading_ones; -} - -void OpTensor3d(const Handle& handle, - miopenTensorOp_t tensorOp, - const void* alpha0, - const TensorDescriptor& aTensorDesc, - ConstData_t ATensor, - const void* alpha1, - const TensorDescriptor& bTensorDesc, - ConstData_t BTensor, - const void* beta, - const TensorDescriptor& cTensorDesc, - Data_t CTensor, - const size_t Aoffset, - const size_t Boffset, - const size_t Coffset, - const bool nonStandardSquash) -{ - auto alens = aTensorDesc.GetLengths(); - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); - - auto astrides = aTensorDesc.GetStrides(); - auto bstrides = bTensorDesc.GetStrides(); - auto cstrides = cTensorDesc.GetStrides(); - - auto bsize = blens.size(); - - // first_not_one is incorrect if btensor size equal to 1 - auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); - auto d = std::distance(blens.begin(), first_not_one.base()); - - // quick fix - int num_wg = first_not_one != blens.rend() - ? static_cast(*first_not_one == 0 ? 
1 : *first_not_one) - : 1; - int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); - - unsigned int bitmap = 0; - // update bitmap for first_not_one - bitmap |= (1 << (bsize - d)); - - // (d-2) is because distance starts from 1 and 0 - // also, we need to go past the "first_not_one" as that is already - // accounted for in the bitmap - CreateBitmapAndGrid(bitmap, blens, clens, num_wg, work_per_wg, static_cast(d - 2)); - -#if(MIO_TENSOROCL_DEBUG == 1) - printf("bitmap: %u\n", bitmap); - printf("work_per_wg: %d, num_wg: %d\n", work_per_wg, num_wg); -#endif - - int max_num_wg = 4096; - num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; - - size_t local_threads = 256; - - std::string network_config{}; - - network_config = std::to_string(bTensorDesc.GetType()) + "-" + - std::to_string(aTensorDesc.GetType()) + "-" + std::to_string(tensorOp) + "-"; - - // for naive tensor ops - size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 2 : 1; - const std::string data_type = GetDataType(bTensorDesc.GetType()); - const std::string READ_TYPE = (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK); - - size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); - size_t grp_sz = (total_work + local_threads - 1) / local_threads; - - // opencl kernels are no longer supported, fallback to generic case - bool lite_applicable = grp_sz <= size_t(max_num_wg); - - bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 && - (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2]; - - bool is_squashed = nonStandardSquash && !is_lite && - (blens[0] == 1 && clens[0] == 1 && clens[1] == 1 && blens[2] == clens[2]); - - grp_sz = std::min(size_t(max_num_wg), grp_sz); - size_t glb_sz = local_threads * grp_sz; - - size_t local_threads2 = 64; - size_t total_work2 = clens[1]; - size_t grp_sz2 = (total_work2 + local_threads2 - 1) / local_threads2; - grp_sz2 = std::min(size_t(max_num_wg / grp_sz), grp_sz2); - size_t glb_sz2 = local_threads2 * grp_sz2; - - visit_float(bTensorDesc.GetType(), [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(alpha1))); - auto miopen_beta = as_float(*(static_cast(beta))); - - if(lite_applicable && is_lite) - { - - network_config += std::to_string(RD_BLCK) + "x" + std::to_string(local_threads) + "x" + - std::to_string(grp_sz) + std::to_string(local_threads2) + - std::to_string(grp_sz2); - - auto&& kernels = handle.GetKernels("Op2dTensorLite", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - - kernel(ATensor, - static_cast(astrides[1]), // a_cstride, - BTensor, - static_cast(bstrides[1]), // b_cstride, - CTensor, - static_cast(cstrides[1]), // c_cstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(total_work), - static_cast(total_work2), - static_cast(!float_equal(miopen_beta, 0.0)), - static_cast(blens[1] == 1)); - - return; - } - } - else if(is_squashed) - { - network_config += std::to_string(RD_BLCK) + "x" + std::to_string(local_threads) + "x" + - std::to_string(grp_sz); - - auto&& kernels = handle.GetKernels("Op2dTensorSquash", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - - kernel(ATensor, - BTensor, - static_cast(blens[1]), // b_c, - static_cast(bstrides[1]), // b_cstride, - CTensor, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - 
static_cast(Coffset), - static_cast(total_work), - static_cast(!float_equal(miopen_alpha0, 0.0)), - static_cast(!float_equal(miopen_alpha1, 0.0)), - static_cast(!float_equal(miopen_beta, 0.0))); - - return; - } - } - else - { - - network_config += std::to_string(max_num_wg) + "-" + std::to_string(local_threads) + - "x" + std::to_string(num_wg); - - auto&& kernels = handle.GetKernels("Op3dTensorGeneric", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - - kernel(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(blens[1] == 1 ? clens[1] : blens[1]), // b_c, - static_cast(blens[2] == 1 ? clens[2] : blens[2]), // b_h, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_h, - static_cast(astrides[0]), // a_nstride, - static_cast(astrides[1]), // a_cstride, - static_cast(astrides[2]), // a_hstride, - static_cast(blens[0] == 1 ? 0 : bstrides[0]), // b_nstride, - static_cast(blens[1] == 1 ? 0 : bstrides[1]), // b_cstride, - static_cast(blens[2] == 1 ? 0 : bstrides[2]), // b_hstride, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - - return; - } - } - - std::string parms = " -DMIOPEN_TYPE=" + GetDataType(bTensorDesc.GetType()); - - parms += GetDataTypeKernelParams(aTensorDesc.GetType()); - - parms += " -DMIOPEN_TENSOR_OP="; - switch(tensorOp) - { - case 0: parms += "miopenAdd"; break; - case 1: parms += "miopenMul"; break; - case 2: parms += "miopenMin"; break; - case 3: parms += "miopenMax"; break; - } - std::string program_name = "MIOpenTensorKernels.cl"; - - if(lite_applicable && is_lite) - { - parms += " -DUSE_2D_TENSOR_LITE"; - parms += " -DRD_BLCK=" + std::to_string(RD_BLCK) + " -DREAD_TYPE=" + READ_TYPE; - - const std::vector vld{local_threads, 1, 1}; - const std::vector vgd1{glb_sz, glb_sz2, 1}; - - handle.AddKernel( - "Op2dTensorLite", network_config, program_name, "Op2dTensorLite", vld, vgd1, parms)( - ATensor, - static_cast(astrides[1]), // a_cstride, - BTensor, - static_cast(bstrides[1]), // b_cstride, - CTensor, - static_cast(cstrides[1]), // c_cstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(total_work), - static_cast(total_work2), - static_cast(!float_equal(miopen_beta, 0.0)), - static_cast(blens[1] == 1)); - } - else if(is_squashed) - { - parms += " -DUSE_2D_TENSOR_SQUASH"; - parms += " -DRD_BLCK=" + std::to_string(RD_BLCK) + " -DREAD_TYPE=" + READ_TYPE; - - const std::vector vld{local_threads, 1, 1}; - const std::vector vgd1{glb_sz, 1, 1}; - - handle.AddKernel("Op2dTensorSquash", - network_config, - program_name, - "Op2dTensorSquash", - vld, - vgd1, - parms)(ATensor, - BTensor, - static_cast(blens[1]), // b_c, - static_cast(bstrides[1]), // b_cstride, - CTensor, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(total_work), - static_cast(!float_equal(miopen_alpha0, 0.0)), - static_cast(!float_equal(miopen_alpha1, 0.0)), - static_cast(!float_equal(miopen_beta, 0.0))); - } - else - { - // Special case for adding tensors in place - program_name = "MIOpenTensorKernelsHip.cpp"; - local_threads = 32; - num_wg = std::clamp( - (clens[0] * clens[1] * clens[2]) / local_threads, size_t(1), size_t(max_num_wg)); - num_wg = 
num_wg > max_num_wg ? max_num_wg : num_wg; - - size_t global_threads; - global_threads = num_wg * local_threads; - const std::vector vld{local_threads, 1, 1}; - const std::vector vgd{global_threads, 1, 1}; - - parms += " -DUSE_3D_TENSOR_GENERIC"; - - handle.AddKernel("Op3dTensorGeneric", - network_config, - program_name, - "Op3dTensorGeneric", - vld, - vgd, - parms)( - ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(blens[1] == 1 ? clens[1] : blens[1]), // b_c, - static_cast(blens[2] == 1 ? clens[2] : blens[2]), // b_h, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_h, - static_cast(astrides[0]), // a_nstride, - static_cast(astrides[1]), // a_cstride, - static_cast(astrides[2]), // a_hstride, - static_cast(blens[0] == 1 ? 0 : bstrides[0]), // b_nstride, - static_cast(blens[1] == 1 ? 0 : bstrides[1]), // b_cstride, - static_cast(blens[2] == 1 ? 0 : bstrides[2]), // b_hstride, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - }); -} - -void OpTensor4d(const Handle& handle, - miopenTensorOp_t tensorOp, - const void* alpha0, - const TensorDescriptor& aTensorDesc, - ConstData_t ATensor, - const void* alpha1, - const TensorDescriptor& bTensorDesc, - ConstData_t BTensor, - const void* beta, - const TensorDescriptor& cTensorDesc, - Data_t CTensor, - const size_t Aoffset, - const size_t Boffset, - const size_t Coffset) -{ - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); - auto dims = clens.size(); - - auto astrides = aTensorDesc.GetStrides(); - auto bstrides = bTensorDesc.GetStrides(); - auto bsize = blens.size(); - auto cstrides = cTensorDesc.GetStrides(); - - // first_not_one is incorrect if btensor size equal to 1 - auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); - auto d = std::distance(blens.begin(), first_not_one.base()); - - // quick fix - int num_wg = first_not_one != blens.rend() - ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) - : 1; - int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); - - unsigned int bitmap = 0; - // update bitmap for first_not_one - bitmap |= (1 << (bsize - d)); - - // (d-2) is because distance starts from 1 and 0 - // also, we need to go past the "first_not_one" as that is already - // accounted for in the bitmap - CreateBitmapAndGrid(bitmap, blens, clens, num_wg, work_per_wg, static_cast(d - 2)); - - // quick fix for btensor = <1, 1, 1, 1> - if(bTensorDesc.GetElementSize() == 1) - bitmap = 4; - -#if(MIO_TENSOROCL_DEBUG == 1) - printf("bitmap: %u\n", bitmap); - printf("work_per_wg: %d, num_wg: %d\n", work_per_wg, num_wg); -#endif - - // Forward Convolution Bias specialization - // for fwd-bias, bitmap looks like <0, 1, 0, 0> - // Is the no. of work-groups and the work for each wg balanced? - auto fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; - auto incr_wg = 0; - // This block gives off indexing for 5d tensors, skipping - if(fwd_conv_bias == 1 && dims < 5 && num_wg < 640 && work_per_wg > 256 && clens[0] > 0) - { // 640 workgroups of size 256 needed to completely fill the GPU - - work_per_wg /= clens[0]; // c_n; - num_wg *= clens[0]; // c_n; - incr_wg = 1; - } - - int num_wg_orig = num_wg; - int max_num_wg = 4096; - num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; - - size_t local_threads = 256; - - // Does the bitmap contain leading ones, i.e. 1,1,1,0 or 1,1,0,0 - // or 1,1,1,1 or 1,0,0,0 - bool leading_ones = IsBitmapLeadingOnes(bitmap, dims, static_cast(d - 2)); - if(leading_ones && work_per_wg < 64) - { - local_threads = 64; - } - - std::string program_name = "MIOpenTensorKernels.cl"; - - const std::vector vld{local_threads, 1, 1}; - - // Special case for adding tensors in place - size_t global_threads; - global_threads = - (static_cast(leading_ones) == 1 && (d - 1) == 3) ? num_wg : num_wg * local_threads; - global_threads = (global_threads < local_threads) ? local_threads : global_threads; - - const std::vector vgd{global_threads, 1, 1}; - - bool packed_tensor = true; - - // auto alens = aTensorDesc.GetLengths(); - packed_tensor &= aTensorDesc.IsPacked(); - packed_tensor &= bTensorDesc.IsPacked(); - packed_tensor &= cTensorDesc.IsPacked(); - - bool packed_equal_tensor = - packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); - -#if(MIO_TENSOROCL_DEBUG == 1) - printf("packed_tensor: %d\n", packed_tensor); - printf("equal_tensor: %d\n", bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); -#endif - - // for naive tensor ops - const std::string data_type = GetDataType(bTensorDesc.GetType()); - - size_t TENS_LEN = cTensorDesc.GetElementSize(); - size_t RD_BLCK = (TENS_LEN % 4 == 0) ? 4 : (TENS_LEN % 2 == 0) ? 2 : 1; - const std::string READ_TYPE = (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK); - - size_t total_work = std::max(TENS_LEN / RD_BLCK, size_t(1)); - size_t grp_sz = (total_work + local_threads - 1) / local_threads; - grp_sz = std::min(size_t(max_num_wg), grp_sz); - size_t glb_sz = local_threads * grp_sz; - - std::string network_config{}; - network_config += - std::to_string(bTensorDesc.GetType()) + "-" + std::to_string(aTensorDesc.GetType()) + "-" + - std::to_string(tensorOp) + "-" + std::to_string(max_num_wg) + "-" + - ((fwd_conv_bias == 0 && packed_equal_tensor) ? 
"" : std::to_string(global_threads)) + "-" + - std::to_string(local_threads); - - visit_float(bTensorDesc.GetType(), [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(alpha1))); - auto miopen_beta = as_float(*(static_cast(beta))); - - if(fwd_conv_bias != 0) - { - if(packed_tensor) - { - auto&& kernels = handle.GetKernels("OpTensorFwdBias", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - kernel(ATensor, - BTensor, - static_cast(blens[1]), - CTensor, - static_cast(clens[0]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - work_per_wg, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - static_cast(incr_wg)); - - return; - } - } - else - { - - auto&& kernels = handle.GetKernels("OpTensorFwdBiasGeneric", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - kernel(ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - BTensor, - static_cast(blens[1]), - static_cast(bstrides[1]), - CTensor, - static_cast(clens[0]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - static_cast(cstrides[2]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - static_cast(incr_wg)); - return; - } - } - } - // precede leading_ones for bitmap = 1,1,1,1 - else if(packed_equal_tensor) - { - network_config += "x" + std::to_string(grp_sz) + "x" + std::to_string(RD_BLCK); - auto&& kernels = handle.GetKernels("Op4dTensorLite", network_config); - if(!kernels.empty()) - { - auto kernel = kernels.front(); - kernel(ATensor, - BTensor, - CTensor, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(total_work), - static_cast(!float_equal(miopen_beta, 0.0))); - return; - } - } - else if(leading_ones) - { - if(packed_tensor) - { - - auto&& kernels = handle.GetKernels("OpTensorLeadingOnes", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - kernel(ATensor, - BTensor, - CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - work_per_wg, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - bitmap); - - return; - } - } - else - { - auto&& kernels = handle.GetKernels("OpTensorLeadingOnesGeneric", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - kernel(ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - BTensor, - static_cast(bstrides[0]), - static_cast(bstrides[1]), - static_cast(bstrides[2]), - CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - static_cast(cstrides[2]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - bitmap); - return; - } - } - } - else - { - auto&& kernels = handle.GetKernels("Op4dTensorGeneric", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - kernel(ATensor, - static_cast(astrides[0]), // 
a_nstride, - static_cast(astrides[1]), // a_cstride, - static_cast(astrides[2]), // a_hstride, - BTensor, - static_cast(blens[1]), // b_c, - static_cast(blens[2]), // b_h, - static_cast(blens[3]), // b_w, - static_cast(bstrides[0]), // b_nstride, - static_cast(bstrides[1]), // b_cstride, - static_cast(bstrides[2]), // b_hstride, - CTensor, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_h, - static_cast(clens[3]), // c_w, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - bitmap, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig)); - return; - } - } - - std::string parms = " -DMIOPEN_TYPE=" + GetDataType(bTensorDesc.GetType()) + - " -DMAX_NUM_WG=" + std::to_string(max_num_wg); - - parms += GetDataTypeKernelParams(aTensorDesc.GetType()); - - parms += " -DMIOPEN_TENSOR_OP="; - switch(tensorOp) - { - case 0: parms += "miopenAdd"; break; - case 1: parms += "miopenMul"; break; - case 2: parms += "miopenMin"; break; - case 3: parms += "miopenMax"; break; - } - - if(fwd_conv_bias != 0) - { - if(packed_tensor) - { - parms += " -DUSE_FWD_BIAS"; - - handle.AddKernel("OpTensorFwdBias", - network_config, - program_name, - "OpTensorFwdBias", - vld, - vgd, - parms)(ATensor, - BTensor, - static_cast(blens[1]), - CTensor, - static_cast(clens[0]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - work_per_wg, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - static_cast(incr_wg)); - } - else - { - parms += " -DUSE_FWD_BIAS_GENERIC"; - handle.AddKernel("OpTensorFwdBiasGeneric", - network_config, - program_name, - "OpTensorFwdBiasGeneric", - vld, - vgd, - parms)(ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - BTensor, - static_cast(blens[1]), - static_cast(bstrides[1]), - CTensor, - static_cast(clens[0]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - static_cast(cstrides[2]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - static_cast(incr_wg)); - } - } - // precede leading_ones for bitmap = 1,1,1,1 - else if(packed_equal_tensor) - { - parms += " -DUSE_4D_TENSOR_LITE"; - parms += " -DRD_BLCK=" + std::to_string(RD_BLCK) + " -DREAD_TYPE=" + READ_TYPE; - - const std::vector vgd1{glb_sz, 1, 1}; - - handle.AddKernel( - "Op4dTensorLite", network_config, program_name, "Op4dTensorLite", vld, vgd1, parms)( - ATensor, - BTensor, - CTensor, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(total_work), - static_cast(!float_equal(miopen_beta, 0.0))); - } - else if(leading_ones) - { - if(packed_tensor) - { - parms += " -DUSE_LEADING_ONES"; - handle.AddKernel("OpTensorLeadingOnes", - network_config, - program_name, - "OpTensorLeadingOnes", - vld, - vgd, - parms)(ATensor, - BTensor, - CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - work_per_wg, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - bitmap); - } - else - { - - parms += " 
-DUSE_LEADING_ONES_GENERIC"; - - handle.AddKernel("OpTensorLeadingOnesGeneric", - network_config, - program_name, - "OpTensorLeadingOnesGeneric", - vld, - vgd, - parms)(ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - BTensor, - static_cast(bstrides[0]), - static_cast(bstrides[1]), - static_cast(bstrides[2]), - CTensor, - static_cast(clens[1]), - static_cast(clens[2]), - static_cast(clens[3]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - static_cast(cstrides[2]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig), - bitmap); - } - } - else - { - parms += " -DUSE_4D_TENSOR_GENERIC"; - - handle.AddKernel("Op4dTensorGeneric", - network_config, - program_name, - "Op4dTensorGeneric", - vld, - vgd, - parms)(ATensor, - static_cast(astrides[0]), // a_nstride, - static_cast(astrides[1]), // a_cstride, - static_cast(astrides[2]), // a_hstride, - BTensor, - static_cast(blens[1]), // b_c, - static_cast(blens[2]), // b_h, - static_cast(blens[3]), // b_w, - static_cast(bstrides[0]), // b_nstride, - static_cast(bstrides[1]), // b_cstride, - static_cast(bstrides[2]), // b_hstride, - CTensor, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_h, - static_cast(clens[3]), // c_w, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - bitmap, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig)); - } - }); -} - -void OpTensorOther(const Handle& handle, - miopenTensorOp_t tensorOp, - const void* alpha0, - const TensorDescriptor& aTensorDesc, - ConstData_t ATensor, - const void* alpha1, - const TensorDescriptor& bTensorDesc, - ConstData_t BTensor, - const void* beta, - const TensorDescriptor& cTensorDesc, - Data_t CTensor, - const size_t Aoffset, - const size_t Boffset, - const size_t Coffset) -{ - auto blens = bTensorDesc.GetLengths(); - auto clens = cTensorDesc.GetLengths(); - - auto astrides = aTensorDesc.GetStrides(); - auto bstrides = bTensorDesc.GetStrides(); - auto bsize = blens.size(); - auto cstrides = cTensorDesc.GetStrides(); - - const bool case_1d = bsize == 1; - const bool case_2d = bsize == 2; - const bool case_5d = bsize == 5; - - const bool use_hip = case_1d || case_2d; - - // first_not_one is incorrect if btensor size equal to 1 - auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); - auto d = std::distance(blens.begin(), first_not_one.base()); - - // quick fix - int num_wg = first_not_one != blens.rend() - ? static_cast(*first_not_one == 0 ? 
1 : *first_not_one) - : 1; - int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); - - unsigned int bitmap = 0; - // update bitmap for first_not_one - bitmap |= (1 << (bsize - d)); - - // (d-2) is because distance starts from 1 and 0 - // also, we need to go past the "first_not_one" as that is already - // accounted for in the bitmap - CreateBitmapAndGrid(bitmap, blens, clens, num_wg, work_per_wg, static_cast(d - 2)); - -#if(MIO_TENSOROCL_DEBUG == 1) - printf("bitmap: %u\n", bitmap); - printf("work_per_wg: %d, num_wg: %d\n", work_per_wg, num_wg); -#endif - - int num_wg_orig = num_wg; - int max_num_wg = 4096; - - size_t local_threads = 256; - - if(case_2d) - local_threads = 32; - - if(case_1d) - num_wg = std::clamp(clens[0] / local_threads, size_t(1), size_t(max_num_wg)); - if(case_2d) - num_wg = std::clamp((clens[0] * clens[1]) / local_threads, size_t(1), size_t(max_num_wg)); - num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; - - const std::vector vld{local_threads, 1, 1}; - - // Special case for adding tensors in place - size_t global_threads; - global_threads = num_wg * local_threads; - - const std::vector vgd{global_threads, 1, 1}; - - std::string program_name = use_hip ? "MIOpenTensorKernelsHip.cpp" : "MIOpenTensorKernels.cl"; - - std::string network_config{}; - network_config += std::to_string(bTensorDesc.GetType()) + "-" + - std::to_string(aTensorDesc.GetType()) + "-" + std::to_string(tensorOp) + "-" + - std::to_string(global_threads) + "-" + std::to_string(local_threads); - - if(case_1d || case_2d) - { - if(aTensorDesc.AllDimsFitIntoInt()) - { - network_config += "-32bit"; - } - else - { - network_config += "-64bit"; - } - } - - visit_float(bTensorDesc.GetType(), [&](auto as_float) { - auto miopen_alpha0 = as_float(*(static_cast(alpha0))); - auto miopen_alpha1 = as_float(*(static_cast(alpha1))); - auto miopen_beta = as_float(*(static_cast(beta))); - - if(case_5d) - { - auto&& kernels = handle.GetKernels("Op5dTensorGeneric", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - kernel(ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - static_cast(astrides[3]), - BTensor, - static_cast(blens[1]), // b_c, - static_cast(blens[2]), // b_d, - static_cast(blens[3]), // b_h, - static_cast(blens[4]), // b_w, - static_cast(bstrides[0]), // b_nstride, - static_cast(bstrides[1]), // b_cstride, - static_cast(bstrides[2]), // b_dstride, - static_cast(bstrides[3]), // b_hstride, - CTensor, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_d, - static_cast(clens[3]), // c_h, - static_cast(clens[4]), // c_w, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_dstride, - static_cast(cstrides[3]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - bitmap, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig)); - return; - } - } - else if(case_2d) - { - auto&& kernels = handle.GetKernels("Op2dTensorGeneric", network_config); - - if(!kernels.empty()) - { - auto kernel = kernels.front(); - - if(aTensorDesc.AllDimsFitIntoInt()) - { - kernel(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(blens[1] == 1 ? clens[1] : blens[1]), - static_cast(clens[1]), - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(blens[1] == 1 ? 
0 : bstrides[1]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - else - { - kernel(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(blens[1] == 1 ? clens[1] : blens[1]), - static_cast(clens[1]), - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(blens[1] == 1 ? 0 : bstrides[1]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - - return; - } - } - else if(case_1d) - { - auto&& kernels = handle.GetKernels("Op1dTensorGeneric", network_config); - - if(!kernels.empty()) - { - - auto kernel = kernels.front(); - - if(aTensorDesc.AllDimsFitIntoInt()) - { - kernel(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(astrides[0]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(cstrides[0]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - else - { - kernel(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(astrides[0]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(cstrides[0]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - - return; - } - } - - std::string parms = " -DMIOPEN_TYPE=" + GetDataType(bTensorDesc.GetType()) + - " -DMAX_NUM_WG=" + std::to_string(max_num_wg); - - parms += GetDataTypeKernelParams(aTensorDesc.GetType()); - - parms += " -DMIOPEN_TENSOR_OP="; - switch(tensorOp) - { - case 0: parms += "miopenAdd"; break; - case 1: parms += "miopenMul"; break; - case 2: parms += "miopenMin"; break; - case 3: parms += "miopenMax"; break; - } - - if(aTensorDesc.AllDimsFitIntoInt()) - { - parms += " -DDIM_TYPE=uint32_t"; - } - else - { - parms += " -DDIM_TYPE=uint64_t"; - } - - if(case_5d) - { - parms += " -DUSE_5D_TENSOR_GENERIC"; - - handle.AddKernel("Op5dTensorGeneric", - network_config, - program_name, - "Op5dTensorGeneric", - vld, - vgd, - parms)(ATensor, - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(astrides[2]), - static_cast(astrides[3]), - BTensor, - static_cast(blens[1]), // b_c, - static_cast(blens[2]), // b_d, - static_cast(blens[3]), // b_h, - static_cast(blens[4]), // b_w, - static_cast(bstrides[0]), // b_nstride, - static_cast(bstrides[1]), // b_cstride, - static_cast(bstrides[2]), // b_dstride, - static_cast(bstrides[3]), // b_hstride, - CTensor, - static_cast(clens[1]), // c_c, - static_cast(clens[2]), // c_d, - static_cast(clens[3]), // c_h, - static_cast(clens[4]), // c_w, - static_cast(cstrides[0]), // c_nstride, - static_cast(cstrides[1]), // c_cstride, - static_cast(cstrides[2]), // c_dstride, - static_cast(cstrides[3]), // c_hstride, - miopen_alpha0, - miopen_alpha1, - miopen_beta, - bitmap, - work_per_wg, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(num_wg_orig)); - } - else if(case_2d) - { - parms += " -DUSE_2D_TENSOR_GENERIC"; - - if(aTensorDesc.AllDimsFitIntoInt()) - { - handle.AddKernel("Op2dTensorGeneric", - network_config, - program_name, - "Op2dTensorGeneric", - vld, - vgd, - parms)(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - 
static_cast(Boffset), - static_cast(Coffset), - static_cast(blens[1] == 1 ? clens[1] : blens[1]), - static_cast(clens[1]), - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(blens[1] == 1 ? 0 : bstrides[1]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - else - { - handle.AddKernel("Op2dTensorGeneric", - network_config, - program_name, - "Op2dTensorGeneric", - vld, - vgd, - parms)(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(blens[1] == 1 ? clens[1] : blens[1]), - static_cast(clens[1]), - static_cast(astrides[0]), - static_cast(astrides[1]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(blens[1] == 1 ? 0 : bstrides[1]), - static_cast(cstrides[0]), - static_cast(cstrides[1]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - } - else if(case_1d) - { - parms += " -DUSE_1D_TENSOR_GENERIC"; - - if(aTensorDesc.AllDimsFitIntoInt()) - { - handle.AddKernel("Op1dTensorGeneric", - network_config, - program_name, - "Op1dTensorGeneric", - vld, - vgd, - parms)(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(astrides[0]), - static_cast(blens[0] == 1 ? 0 : bstrides[0]), - static_cast(cstrides[0]), - miopen_alpha0, - miopen_alpha1, - miopen_beta, - static_cast(clens[0]), - !float_equal(miopen_beta, 0.0)); - } - else - { - handle.AddKernel("Op1dTensorGeneric", - network_config, - program_name, - "Op1dTensorGeneric", - vld, - vgd, - parms)(ATensor, - BTensor, - CTensor, - static_cast(Aoffset), - static_cast(Boffset), - static_cast(Coffset), - static_cast(astrides[0]), - static_cast(blens[0] == 1 ? 
0 : bstrides[0]),
-                              static_cast(cstrides[0]),
-                              miopen_alpha0,
-                              miopen_alpha1,
-                              miopen_beta,
-                              static_cast(clens[0]),
-                              !float_equal(miopen_beta, 0.0));
-        }
-    }
-    });
-}
-
-void OpTensor(const Handle& handle,
-              miopenTensorOp_t tensorOp,
-              const void* alpha0,
-              const TensorDescriptor& aTensorDesc,
-              ConstData_t ATensor,
-              const void* alpha1,
-              const TensorDescriptor& bTensorDesc,
-              ConstData_t BTensor,
-              const void* beta,
-              const TensorDescriptor& cTensorDesc,
-              Data_t CTensor,
-              const size_t Aoffset,
-              const size_t Boffset,
-              const size_t Coffset,
-              bool nonStandardSquash)
-{
-    if(ATensor == nullptr || BTensor == nullptr || CTensor == nullptr)
-    {
-        MIOPEN_THROW(miopenStatusBadParm);
-    }
-
-    // if(aTensorDesc != cTensorDesc)
-    if(aTensorDesc.GetElementSize() != cTensorDesc.GetElementSize())
-    {
-        MIOPEN_THROW("A and C Tensors do not match");
-    }
-
-    if(bTensorDesc.GetType() != cTensorDesc.GetType())
-    {
-        MIOPEN_THROW("Datatypes for B and C tensors do not match !");
-    }
-
-    auto blens = bTensorDesc.GetLengths();
-#if(MIO_TENSOROCL_DEBUG == 1)
-    printf("blen:[");
-    for(auto len : blens)
-    {
-        printf(" %lu", len);
-    }
-    printf("]\n");
-#endif
-    auto clens = cTensorDesc.GetLengths();
-
-    if(clens.size() > 5)
-    {
-        MIOPEN_THROW("Tensor dimension larger than 5: " + std::to_string(clens.size()));
-    }
-
-    if(blens.size() != clens.size())
-    {
-        MIOPEN_THROW("Number of dims in B and C Tensors do not match: " +
-                     std::to_string(blens.size()) + ", " + std::to_string(clens.size()));
-    }
-
-    if(!nonStandardSquash)
-    {
-        for(std::size_t i = 0; i < clens.size(); i++)
-        {
-            if(blens[i] != 1 && blens[i] != clens[i])
-            {
-                MIOPEN_THROW("BTensor dim != 1 && BTensor dim != CTensor dim: " +
-                             std::to_string(i));
-            }
-        }
-    }
-    else
-    {
-        // non standard behavior because blens[1] can be not equalt to clens[1]
-        if(!(clens.size() == 3 && blens[0] == 1 && clens[0] == 1 && blens[2] == clens[2]))
-        {
-            MIOPEN_THROW("Non standard squashed operation supported only for 3d tensors and for "
-                         "the specific configuration");
-        }
-    }
-
-    auto bsize = blens.size();
-    if(bsize == 3)
-    {
-        OpTensor3d(handle,
-                   tensorOp,
-                   alpha0,
-                   aTensorDesc,
-                   ATensor,
-                   alpha1,
-                   bTensorDesc,
-                   BTensor,
-                   beta,
-                   cTensorDesc,
-                   CTensor,
-                   Aoffset,
-                   Boffset,
-                   Coffset,
-                   nonStandardSquash);
-    }
-    else if(bsize == 4)
-    {
-        OpTensor4d(handle,
-                   tensorOp,
-                   alpha0,
-                   aTensorDesc,
-                   ATensor,
-                   alpha1,
-                   bTensorDesc,
-                   BTensor,
-                   beta,
-                   cTensorDesc,
-                   CTensor,
-                   Aoffset,
-                   Boffset,
-                   Coffset);
-    }
-    else
-    {
-        OpTensorOther(handle,
-                      tensorOp,
-                      alpha0,
-                      aTensorDesc,
-                      ATensor,
-                      alpha1,
-                      bTensorDesc,
-                      BTensor,
-                      beta,
-                      cTensorDesc,
-                      CTensor,
-                      Aoffset,
-                      Boffset,
-                      Coffset);
-    }
-}
-
-struct two_exp_ceiling_t
-{
-    std::size_t operator()(std::size_t n) const
-    {
-        assert(n > 0);
-
-        std::size_t i = 1;
-
-        n--;
-        while(n != 0)
-        {
-            i *= 2;
-            n /= 2;
-        }
-
-        return i;
-    }
-};
-
-static std::vector<std::size_t> get_worker_sizes(const std::vector<std::size_t>& data_sizes)
-{
-    const std::size_t dim = data_sizes.size();
-
-    std::vector<std::size_t> worker_sizes(dim);
-
-    std::transform(data_sizes.begin(), data_sizes.end(), worker_sizes.begin(), two_exp_ceiling_t{});
-
-    std::size_t wgd = std::accumulate(
-        worker_sizes.begin(), worker_sizes.end(), std::size_t{1}, std::multiplies<std::size_t>());
-
-    if(wgd > 65536)
-    {
-        std::size_t n = wgd / 65536;
-
-        int i = 0;
-        while(n > 1 && i < dim)
-        {
-            std::size_t size_old = worker_sizes[i];
-            worker_sizes[i]      = (size_old - 1) / n + 1;
-            n /= size_old / worker_sizes[i];
-            ++i;
-        }
-    }
-
-    return worker_sizes;
-}
-
-void SetTensor(const Handle&
handle, - const TensorDescriptor& yDesc, - Data_t y, - const void* alpha, - const int offset) -{ - if(y == nullptr || alpha == nullptr) - { - MIOPEN_THROW(miopenStatusBadParm); - } - - const TensorDescriptor yDesc_flat = GetFlattenedTensorDescriptor(yDesc); - -#ifndef NDEBUG - if(yDesc.GetNumDims() != yDesc_flat.GetNumDims()) - { - MIOPEN_LOG_I2("real descriptor: " << yDesc); - MIOPEN_LOG_I2("flat descriptor: " << yDesc_flat); - } -#endif - - const std::size_t yDim_flat = yDesc_flat.GetNumDims(); - - assert(yDim_flat > 0 && yDim_flat <= 5); - - std::string kernel_name = "SubTensorOpWithScalar" + std::to_string(yDim_flat) + "d"; - - const miopenDataType_t dataType = yDesc_flat.GetType(); - - std::string network_config = "set " + std::to_string(dataType); - for(auto& len : yDesc_flat.GetLengths()) - { - network_config += " " + std::to_string(len); - } - - auto&& kernels = handle.GetKernels(kernel_name, network_config); - - KernelInvoke kernel; - - if(!kernels.empty()) - { - kernel = kernels.front(); - } - else - { - std::string program_name = "MIOpenSubTensorOpWithScalarKernel.cl"; - - std::vector worker_sizes = get_worker_sizes(yDesc_flat.GetLengths()); - - std::size_t wgd = std::accumulate(worker_sizes.begin(), - worker_sizes.end(), - std::size_t{1}, - std::multiplies()); - - std::size_t wld = 256 < wgd ? 256 : wgd; - std::stringstream ss; - ss << "-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET" - << GetDataTypeKernelParams(dataType); - for(int i = 0; i < yDim_flat; ++i) - { - ss << " -DWORK_LENGTH_" << std::to_string(i) << "=" << std::to_string(worker_sizes[i]); - } - - kernel = handle.AddKernel(kernel_name, - network_config, - program_name, - kernel_name, - {wld, 1, 1}, - {wgd, 1, 1}, - ss.str()); - } - - switch(yDim_flat) - { - case 1: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetLengths()[0])); - }); - - break; - } - case 2: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1])); - }); - - break; - } - case 3: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2])); - }); - - break; - } - case 4: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetStrides()[3]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2]), - static_cast(yDesc_flat.GetLengths()[3])); - }); - - break; - } - case 5: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetStrides()[3]), - static_cast(yDesc_flat.GetStrides()[4]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2]), 
- static_cast(yDesc_flat.GetLengths()[3]), - static_cast(yDesc_flat.GetLengths()[4])); - }); - - break; - } - default: assert(false); - } -} - -void ScaleTensor(const Handle& handle, - const TensorDescriptor& yDesc, - Data_t y, - const void* alpha, - const int offset) -{ - if(y == nullptr || alpha == nullptr) - { - MIOPEN_THROW(miopenStatusBadParm); - } - - const TensorDescriptor yDesc_flat = GetFlattenedTensorDescriptor(yDesc); - -#ifndef NDEBUG - if(yDesc.GetNumDims() != yDesc_flat.GetNumDims()) - { - MIOPEN_LOG_I2("real descriptor: " << yDesc); - MIOPEN_LOG_I2("flat descriptor: " << yDesc_flat); - } -#endif - - const std::size_t yDim_flat = yDesc_flat.GetNumDims(); - - assert(yDim_flat > 0 && yDim_flat <= 5); - - const miopenDataType_t dataType = yDesc_flat.GetType(); - - if(!(dataType == miopenHalf // - || dataType == miopenFloat // - || dataType == miopenInt32 // - || dataType == miopenDouble)) - { - MIOPEN_THROW(miopenStatusBadParm, "ScaleTensor: unsupported data type."); - } - - std::string kernel_name = "SubTensorOpWithScalar" + std::to_string(yDim_flat) + "d"; - - const std::vector& lens = yDesc_flat.GetLengths(); - - std::string network_config = "scale " + std::to_string(yDesc_flat.GetType()); - for(auto& len : lens) - { - network_config += " " + std::to_string(len); - } - - auto&& kernels = handle.GetKernels(kernel_name, network_config); - - KernelInvoke kernel; - - if(!kernels.empty()) - { - kernel = kernels.front(); - } - else - { - std::string program_name = "MIOpenSubTensorOpWithScalarKernel.cl"; - - std::vector worker_sizes = get_worker_sizes(lens); - - std::size_t wgd = std::accumulate(worker_sizes.begin(), - worker_sizes.end(), - std::size_t{1}, - std::multiplies()); - - std::size_t wld = 256 < wgd ? 256 : wgd; - - std::string parms = "-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_MULTIPLY" + - GetDataTypeKernelParams(dataType); - for(int i = 0; i < yDim_flat; ++i) - { - parms += " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); - } - - kernel = handle.AddKernel(kernel_name, - network_config, - program_name, - kernel_name, - {wld, 1, 1}, - {wgd, 1, 1}, - parms); - } - - switch(yDim_flat) - { - case 1: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetLengths()[0])); - }); - - break; - } - case 2: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1])); - }); - - break; - } - case 3: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2])); - }); - - break; - } - case 4: { - visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetStrides()[3]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2]), - static_cast(yDesc_flat.GetLengths()[3])); - }); - - break; - } - case 5: { - 
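// Note: 5-d is the deepest case this switch needs. GetFlattenedTensorDescriptor
// collapses contiguous dimensions first and yDim_flat is asserted above to lie
// in [1, 5], so a fully packed tensor of any rank reaches here as the 1-d case.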
visit_float(dataType, [&](auto as_float) { - kernel(y, - *as_float(alpha), - offset, - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetStrides()[3]), - static_cast(yDesc_flat.GetStrides()[4]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2]), - static_cast(yDesc_flat.GetLengths()[3]), - static_cast(yDesc_flat.GetLengths()[4])); - }); - - break; - } - default: assert(false); - } -} - -void CopyTensor(const Handle& handle, - const TensorDescriptor& srcDesc, - ConstData_t src, - const TensorDescriptor& dstDesc, - Data_t dst, - int srcOffset, - int dstOffset, - bool forseAsync) -{ - if(src == nullptr || dst == nullptr) - { - MIOPEN_THROW(miopenStatusBadParm, "Null pointer for tensor."); - } - - if(srcDesc.GetType() != dstDesc.GetType()) - { - MIOPEN_THROW(miopenStatusBadParm, "Tensor types do not match."); - } - - if(srcDesc.GetLengths() != dstDesc.GetLengths()) - { - MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match."); - } - - auto flat_descriptors = GetConsistentFlattenedTensorDescriptors(srcDesc, dstDesc); - const TensorDescriptor& srcDesc_flat = std::get<0>(flat_descriptors); - const TensorDescriptor& dstDesc_flat = std::get<1>(flat_descriptors); - -#ifndef NDEBUG - if(srcDesc.GetNumDims() != srcDesc_flat.GetNumDims()) - { - MIOPEN_LOG_I2("src real descriptor: " << srcDesc); - MIOPEN_LOG_I2("src flat descriptor: " << srcDesc_flat); - MIOPEN_LOG_I2("dst real descriptor: " << dstDesc); - MIOPEN_LOG_I2("dst flat descriptor: " << dstDesc_flat); - } -#endif - - std::size_t srcDim_flat = srcDesc_flat.GetNumDims(); - - if(srcDim_flat < 1 || srcDim_flat > 5) - { - MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension sizes unsupported."); - } - - if(forseAsync || srcOffset > 0 || dstOffset > 0 || - (!(srcDesc_flat.IsPacked() && dstDesc_flat.IsPacked()))) - { - std::string kernel_name = "SubTensorOpWithSubTensor" + std::to_string(srcDim_flat) + "d"; - - const std::vector& lens = srcDesc_flat.GetLengths(); - - std::string network_config = "copy " + std::to_string(srcDesc_flat.GetType()); - for(auto& len : lens) - { - network_config += " " + std::to_string(len); - } - - auto&& kernels = handle.GetKernels(kernel_name, network_config); - - KernelInvoke kernel; - - if(!kernels.empty()) - { - kernel = kernels.front(); - } - else - { - std::string program_name = "MIOpenSubTensorOpWithSubTensorKernel.cl"; - - std::vector worker_sizes = get_worker_sizes(lens); - - std::size_t wgd = std::accumulate(worker_sizes.begin(), - worker_sizes.end(), - std::size_t{1}, - std::multiplies()); - - std::size_t wld = 256 < wgd ? 
256 : wgd; - - std::string parms = "-DSUBTENSOR_OP_WITH_SUBTENSOR=SUBTENSOR_OP_WITH_SUBTENSOR_COPY" + - GetDataTypeKernelParams(srcDesc_flat.GetType()); - for(std::size_t i = 0; i < srcDim_flat; ++i) - { - parms += - " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); - } - - kernel = handle.AddKernel(kernel_name, - network_config, - program_name, - kernel_name, - {wld, 1, 1}, - {wgd, 1, 1}, - parms); - } - - switch(srcDim_flat) - { - case 1: { - kernel(src, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetLengths()[0]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0])); - - break; - } - case 2: { - kernel(src, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1])); - - break; - } - case 3: { - kernel(src, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetStrides()[2]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - static_cast(srcDesc_flat.GetLengths()[2]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1]), - static_cast(dstDesc_flat.GetStrides()[2])); - - break; - } - case 4: { - kernel(src, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetStrides()[2]), - static_cast(srcDesc_flat.GetStrides()[3]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - static_cast(srcDesc_flat.GetLengths()[2]), - static_cast(srcDesc_flat.GetLengths()[3]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1]), - static_cast(dstDesc_flat.GetStrides()[2]), - static_cast(dstDesc_flat.GetStrides()[3])); - - break; - } - case 5: { - kernel(src, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetStrides()[2]), - static_cast(srcDesc_flat.GetStrides()[3]), - static_cast(srcDesc_flat.GetStrides()[4]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - static_cast(srcDesc_flat.GetLengths()[2]), - static_cast(srcDesc_flat.GetLengths()[3]), - static_cast(srcDesc_flat.GetLengths()[4]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1]), - static_cast(dstDesc_flat.GetStrides()[2]), - static_cast(dstDesc_flat.GetStrides()[3]), - static_cast(dstDesc_flat.GetStrides()[4])); - - break; - } - default: assert(false); - } - } - else - { - handle.Copy(src, dst, srcDesc_flat.GetElementSize() * GetTypeSize(srcDesc_flat.GetType())); - } -} - -std::string GetCastTensorBuildOptionFromType(const std::string& buildOption, miopenDataType_t type) -{ - std::string option(buildOption); - switch(type) - { - case miopenInt8: return option += "0"; - case miopenInt32: return option += "1"; - case miopenHalf: return option += "2"; - case miopenFloat: return option += "3"; - case miopenBFloat16: return option += "4"; - case miopenFloat8: - MIOPEN_THROW(miopenStatusBadParm, "miopenFloat8 data type not supported in cast tensor."); - case miopenBFloat8: - MIOPEN_THROW(miopenStatusBadParm, 
"miopenBFloat8 data type not supported in cast tensor."); - case miopenDouble: - // TODO - MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported in cast tensor."); - case miopenInt64: - MIOPEN_THROW(miopenStatusBadParm, "miopenInt64 data type not supported in cast tensor."); - default: MIOPEN_THROW(miopenStatusBadParm, "Invalid data type in cast tensor desc."); - } -} - -void CastTensor(const Handle& handle, - const void* alpha, - const bool clamping, - const TensorDescriptor& srcDesc, - ConstData_t src, - const TensorDescriptor& dstDesc, - Data_t dst, - int srcOffset, - int dstOffset) -{ - if(src == nullptr || dst == nullptr) - { - MIOPEN_THROW(miopenStatusBadParm, "Null pointer for tensor."); - } - - if(srcDesc.GetLengths() != dstDesc.GetLengths()) - { - MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match."); - } - - auto flat_descriptors = GetConsistentFlattenedTensorDescriptors(srcDesc, dstDesc); - const TensorDescriptor& srcDesc_flat = std::get<0>(flat_descriptors); - const TensorDescriptor& dstDesc_flat = std::get<1>(flat_descriptors); - -#ifndef NDEBUG - if(srcDesc.GetNumDims() != srcDesc_flat.GetNumDims()) - { - MIOPEN_LOG_I2("src real descriptor: " << srcDesc); - MIOPEN_LOG_I2("src flat descriptor: " << srcDesc_flat); - MIOPEN_LOG_I2("dst real descriptor: " << dstDesc); - MIOPEN_LOG_I2("dst flat descriptor: " << dstDesc_flat); - } -#endif - - std::size_t srcDim_flat = srcDesc_flat.GetNumDims(); - - if(srcDim_flat < 1 || srcDim_flat > 5) - { - MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension sizes unsupported."); - } - - if(srcDesc.GetType() == dstDesc.GetType() && srcOffset == 0 && dstOffset == 0 && - srcDesc_flat.IsPacked() && dstDesc_flat.IsPacked()) - { - handle.Copy(src, dst, srcDesc_flat.GetElementSize() * GetTypeSize(srcDesc_flat.GetType())); - } - else - { - std::string kernel_name = "SubTensorOpWithCastTensor" + std::to_string(srcDim_flat) + "d"; - - const std::vector& lens = srcDesc_flat.GetLengths(); - - std::string network_config = "cast " + std::to_string(dstDesc_flat.GetType()); - for(auto& len : lens) - { - network_config += " " + std::to_string(len); - } - - auto&& kernels = handle.GetKernels(kernel_name, network_config); - KernelInvoke kernel; - - auto miopen_alpha = *(static_cast(alpha)); - - if(!kernels.empty()) - { - kernel = kernels.front(); - } - else - { - std::string program_name = "MIOpenSubTensorOpWithCastTensorKernel.cl"; - - std::vector worker_sizes = get_worker_sizes(lens); - - std::size_t wgd = std::accumulate(worker_sizes.begin(), - worker_sizes.end(), - std::size_t{1}, - std::multiplies()); - - std::size_t wld = 256 < wgd ? 256 : wgd; - - std::string parms = - GetCastTensorBuildOptionFromType(" -DMIOPEN_SRC_TYPE=", srcDesc_flat.GetType()) + - GetCastTensorBuildOptionFromType(" -DMIOPEN_DST_TYPE=", dstDesc_flat.GetType()); - - for(std::size_t i = 0; i < srcDim_flat; ++i) - { - parms += - " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); - } - - if(dstDesc_flat.GetType() == miopenBFloat16) - { - parms += " -DMIOPEN_USE_RNE_BFLOAT16=1"; - } - - kernel = handle.AddKernel(kernel_name, - network_config, - program_name, - kernel_name, - {wld, 1, 1}, - {wgd, 1, 1}, - parms); - } - - const int clamping_arg = clamping ? 
1 : 0; - switch(srcDim_flat) - { - case 1: { - kernel(src, - miopen_alpha, - clamping_arg, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetLengths()[0]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0])); - - break; - } - case 2: { - kernel(src, - miopen_alpha, - clamping_arg, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1])); - - break; - } - case 3: { - kernel(src, - miopen_alpha, - clamping_arg, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetStrides()[2]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - static_cast(srcDesc_flat.GetLengths()[2]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1]), - static_cast(dstDesc_flat.GetStrides()[2])); - - break; - } - case 4: { - kernel(src, - miopen_alpha, - clamping_arg, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetStrides()[2]), - static_cast(srcDesc_flat.GetStrides()[3]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - static_cast(srcDesc_flat.GetLengths()[2]), - static_cast(srcDesc_flat.GetLengths()[3]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1]), - static_cast(dstDesc_flat.GetStrides()[2]), - static_cast(dstDesc_flat.GetStrides()[3])); - - break; - } - case 5: { - kernel(src, - miopen_alpha, - clamping_arg, - srcOffset, - static_cast(srcDesc_flat.GetStrides()[0]), - static_cast(srcDesc_flat.GetStrides()[1]), - static_cast(srcDesc_flat.GetStrides()[2]), - static_cast(srcDesc_flat.GetStrides()[3]), - static_cast(srcDesc_flat.GetStrides()[4]), - static_cast(srcDesc_flat.GetLengths()[0]), - static_cast(srcDesc_flat.GetLengths()[1]), - static_cast(srcDesc_flat.GetLengths()[2]), - static_cast(srcDesc_flat.GetLengths()[3]), - static_cast(srcDesc_flat.GetLengths()[4]), - dst, - dstOffset, - static_cast(dstDesc_flat.GetStrides()[0]), - static_cast(dstDesc_flat.GetStrides()[1]), - static_cast(dstDesc_flat.GetStrides()[2]), - static_cast(dstDesc_flat.GetStrides()[3]), - static_cast(dstDesc_flat.GetStrides()[4])); - - break; - } - default: assert(false); - } - } -} - -void TransformTensor(const Handle& handle, - const void* alpha, - const TensorDescriptor& xDesc, - ConstData_t x, - const void* beta, - const TensorDescriptor& yDesc, - Data_t y, - size_t Xoffset, - size_t Yoffset) -{ - if(x == nullptr || y == nullptr) - { - MIOPEN_THROW(miopenStatusBadParm); - } - - if(alpha == nullptr || beta == nullptr) - { - MIOPEN_THROW(miopenStatusBadParm); - } - - auto x_len = xDesc.GetLengths(); - auto y_len = yDesc.GetLengths(); - - if(x_len.size() != y_len.size()) - { - MIOPEN_THROW("Tensor dimension must be the same"); - } - - if(x_len[0] != y_len[0]) - { - MIOPEN_THROW("Tensor x and y batch sizes do not match"); - } - - const auto is_alpha_one = float_equal(*(static_cast(alpha)), 1); - const auto is_beta_zero = float_equal(*(static_cast(beta)), 0); - - if(xDesc.GetType() == miopenInt8 && yDesc.GetType() == miopenInt8 && x_len.size() >= 3) - { - if(x_len[1] <= y_len[1]) - { - 
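// Note: y's channel count may exceed x's only as padding up to the next
// multiple of 4 (presumably for the vectorized int8 layout); y is zero-filled
// first so the padded channels still read back as zeros after the per-batch
// copies below.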
if(x_len[1] <= (y_len[1] - 4) || y_len[1] % 4 != 0) - { - MIOPEN_THROW("Invalid y channel size"); - } - - int8_t zero = 0; - SetTensor(handle, yDesc, y, &zero); - } - else if(x_len[1] % 4 != 0) - { - MIOPEN_THROW("Invalid x channel size"); - } - - size_t batch_n = x_len[0]; - - x_len[0] = 1; - y_len[0] = 1; - - miopen::TensorDescriptor x_batch_desc, y_batch_desc; - x_batch_desc = miopen::TensorDescriptor(miopenInt8, x_len); - y_batch_desc = miopen::TensorDescriptor(miopenInt8, y_len); - - size_t x_batch_sz = x_batch_desc.GetElementSize(); - size_t y_batch_sz = y_batch_desc.GetElementSize(); - - for(size_t i = 0; i < batch_n; i++) - { - size_t x_offset = i * x_batch_sz; - size_t y_offset = i * y_batch_sz; - - if(is_alpha_one && is_beta_zero) - { - CopyTensor(handle, - ((x_len[1] <= y_len[1]) ? x_batch_desc : y_batch_desc), - x, - ((x_len[1] <= y_len[1]) ? x_batch_desc : y_batch_desc), - y, - x_offset, - y_offset); - } - else - { - MIOPEN_THROW(miopenStatusNotImplemented, - "y=alpha*x+beta*y is not supported for int8 yet"); - } - } - } - else - { - auto x_y_len = boost::combine(x_len, y_len); - bool same_spatial_len = std::all_of(x_y_len.begin(), x_y_len.end(), [](auto v) { - return boost::get<0>(v) == boost::get<1>(v); - }); - - if(!same_spatial_len) - { - MIOPEN_THROW("Tensor x and y spatial sizes do not match"); - } - - auto flat_descriptors = GetConsistentFlattenedTensorDescriptors(xDesc, yDesc); - const TensorDescriptor& xDesc_flat = std::get<0>(flat_descriptors); - const TensorDescriptor& yDesc_flat = std::get<1>(flat_descriptors); - - if(xDesc.GetNumDims() != xDesc_flat.GetNumDims()) - { - MIOPEN_LOG_I2("x real descriptor: " << xDesc); - MIOPEN_LOG_I2("x flat descriptor: " << xDesc_flat); - } - - if(yDesc.GetNumDims() != yDesc_flat.GetNumDims()) - { - MIOPEN_LOG_I2("y real descriptor: " << yDesc); - MIOPEN_LOG_I2("y flat descriptor: " << yDesc_flat); - } - - const std::size_t yDim_flat = yDesc_flat.GetNumDims(); - - assert(yDim_flat > 0 && yDim_flat <= 5); - - const miopenDataType_t dataTypex = xDesc_flat.GetType(); - const miopenDataType_t dataTypey = yDesc_flat.GetType(); - - if(!(dataTypex == miopenHalf // - || dataTypex == miopenFloat // - || dataTypex == miopenInt32 // - || dataTypex == miopenBFloat16 // - || dataTypex == miopenDouble)) - { - MIOPEN_THROW("Tensor x is a unsupported data type"); - } - - if(!(dataTypey == miopenHalf // - || dataTypey == miopenFloat // - || dataTypey == miopenInt32 // - || dataTypey == miopenBFloat16 // - || dataTypey == miopenDouble)) - { - MIOPEN_THROW("Tensor y is a unsupported data type"); - } - - if(dataTypex != dataTypey) - { - MIOPEN_THROW("Tensor x and y have different data types"); - } - - std::string kernel_name = "SubTensorOpWithTransform" + std::to_string(yDim_flat) + "d"; - - const std::vector& lens = yDesc_flat.GetLengths(); - - std::string network_config = "transform " + std::to_string(yDesc_flat.GetType()); - for(auto& len : lens) - { - network_config += "x" + std::to_string(len); - } - - if(is_beta_zero) - network_config += "xBETA_IS_ZERO"; - if(is_alpha_one) - network_config += "xALPHA_IS_ONE"; - - auto&& kernels = handle.GetKernels(kernel_name, network_config); - - KernelInvoke kernel; - - if(!kernels.empty()) - { - kernel = kernels.front(); - } - else - { - std::string program_name = "MIOpenSubTensorOpWithTransformKernel.cl"; - - std::vector worker_sizes = get_worker_sizes(lens); - - std::size_t wgd = std::accumulate(worker_sizes.begin(), - worker_sizes.end(), - std::size_t{1}, - std::multiplies()); - - std::size_t wld = 256 < 
wgd ? 256 : wgd; - - std::string parms = - GetDataTypeKernelParams(dataTypey) // - + " -DMIOPEN_BETA_IS_ZERO=" + std::to_string(static_cast(is_beta_zero)) // - + " -DMIOPEN_ALPHA_IS_ONE=" + std::to_string(static_cast(is_alpha_one)); - - for(int i = 0; i < yDim_flat; ++i) - { - parms += - " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); - } - - kernel = handle.AddKernel(kernel_name, - network_config, - program_name, - kernel_name, - {wld, 1, 1}, - {wgd, 1, 1}, - parms); - } - - switch(yDim_flat) - { - case 1: { - visit_float(dataTypey, [&](auto as_float) { - kernel(x, - *as_float(alpha), - y, - *as_float(beta), - static_cast(Xoffset), - static_cast(Yoffset), - static_cast(xDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetLengths()[0])); - }); - - break; - } - case 2: { - visit_float(dataTypey, [&](auto as_float) { - kernel(x, - *as_float(alpha), - y, - *as_float(beta), - static_cast(Xoffset), - static_cast(Yoffset), - static_cast(xDesc_flat.GetStrides()[0]), - static_cast(xDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1])); - }); - - break; - } - case 3: { - visit_float(dataTypey, [&](auto as_float) { - kernel(x, - *as_float(alpha), - y, - *as_float(beta), - static_cast(Xoffset), - static_cast(Yoffset), - static_cast(xDesc_flat.GetStrides()[0]), - static_cast(xDesc_flat.GetStrides()[1]), - static_cast(xDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2])); - }); - - break; - } - case 4: { - visit_float(dataTypey, [&](auto as_float) { - kernel(x, - *as_float(alpha), - y, - *as_float(beta), - static_cast(Xoffset), - static_cast(Yoffset), - static_cast(xDesc_flat.GetStrides()[0]), - static_cast(xDesc_flat.GetStrides()[1]), - static_cast(xDesc_flat.GetStrides()[2]), - static_cast(xDesc_flat.GetStrides()[3]), - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetStrides()[3]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2]), - static_cast(yDesc_flat.GetLengths()[3])); - }); - - break; - } - case 5: { - visit_float(dataTypey, [&](auto as_float) { - kernel(x, - *as_float(alpha), - y, - *as_float(beta), - static_cast(Xoffset), - static_cast(Yoffset), - static_cast(xDesc_flat.GetStrides()[0]), - static_cast(xDesc_flat.GetStrides()[1]), - static_cast(xDesc_flat.GetStrides()[2]), - static_cast(xDesc_flat.GetStrides()[3]), - static_cast(xDesc_flat.GetStrides()[4]), - static_cast(yDesc_flat.GetStrides()[0]), - static_cast(yDesc_flat.GetStrides()[1]), - static_cast(yDesc_flat.GetStrides()[2]), - static_cast(yDesc_flat.GetStrides()[3]), - static_cast(yDesc_flat.GetStrides()[4]), - static_cast(yDesc_flat.GetLengths()[0]), - static_cast(yDesc_flat.GetLengths()[1]), - static_cast(yDesc_flat.GetLengths()[2]), - static_cast(yDesc_flat.GetLengths()[3]), - static_cast(yDesc_flat.GetLengths()[4])); - }); - - break; - } - default: assert(false); - } - } -} - -} // namespace miopen diff --git a/src/rnn/Solutions/Base/bw_data_modular.cpp b/src/rnn/Solutions/Base/bw_data_modular.cpp 
index 04bbfd780e..0f840e98d2 100644
--- a/src/rnn/Solutions/Base/bw_data_modular.cpp
+++ b/src/rnn/Solutions/Base/bw_data_modular.cpp
@@ -62,7 +62,7 @@ void RNNBackwardDataModularAlgo::PrepareWriteBuffers(const Handle& handle,
     }
 }
 
-void RNNBackwardDataModularAlgo::PropDhy(const Handle& handle,
+void RNNBackwardDataModularAlgo::PropDhy(Handle& handle,
                                          ConstData_t dhy,
                                          Data_t workSpace,
                                          unsigned int layer,
@@ -295,7 +295,7 @@ void RNNBackwardDataModularAlgo::UpdateHStatePerTimeSeq(const Handle& handle,
     }
 }
 
-void RNNBackwardDataModularAlgo::PropDhxDcx(const Handle& handle,
+void RNNBackwardDataModularAlgo::PropDhxDcx(Handle& handle,
                                             ConstData_t w,
                                             Data_t dhx,
                                             Data_t dcx,
diff --git a/src/rnn/Solutions/Base/bw_weights_modular.cpp b/src/rnn/Solutions/Base/bw_weights_modular.cpp
index 598d002cb0..76a37e6630 100644
--- a/src/rnn/Solutions/Base/bw_weights_modular.cpp
+++ b/src/rnn/Solutions/Base/bw_weights_modular.cpp
@@ -32,7 +32,7 @@ namespace miopen {
 
 namespace rnn_base {
 
-miopenStatus_t ReducAddBias(const miopen::Handle& handle,
+miopenStatus_t ReducAddBias(miopen::Handle& handle,
                             Data_t dw,
                             const Data_t workSpace,
                             const miopen::TensorDescriptor& dw_desc,
@@ -243,7 +243,7 @@ void RNNBackwardWeightsModularAlgo::HiddenXInputWeights(const Handle& handle,
 }
 
 void RNNBackwardWeightsModularAlgo::BiasUpdate(
-    const Handle& handle, Data_t dw, Data_t workSpace, size_t layer, size_t workSpaceSize) const
+    Handle& handle, Data_t dw, Data_t workSpace, size_t layer, size_t workSpaceSize) const
 {
     if(rnnDesc.biasMode != 0u)
     {
diff --git a/src/rnn/Solutions/Base/fw_data_modular.cpp b/src/rnn/Solutions/Base/fw_data_modular.cpp
index ca6d18d294..450ab0be8b 100644
--- a/src/rnn/Solutions/Base/fw_data_modular.cpp
+++ b/src/rnn/Solutions/Base/fw_data_modular.cpp
@@ -59,13 +59,13 @@ void RNNForwardDataModularAlgo::PrepareWriteBuffers(const Handle& handle,
     }
 }
 
-void RNNForwardDataModularAlgo::PropX(const Handle& handle, const runtimeArgsFwd& runtimeArgs) const
+void RNNForwardDataModularAlgo::PropX(Handle& handle, const runtimeArgsFwd& runtimeArgs) const
 {
     const size_t gemm_batch_size = workspaceInfo.getGateBlockSize()[1];
 
     return PropX(handle, runtimeArgs, 0, gemm_batch_size);
 }
 
-void RNNForwardDataModularAlgo::PropX(const Handle& handle,
+void RNNForwardDataModularAlgo::PropX(Handle& handle,
                                       const runtimeArgsFwd& runtimeArgs,
                                       size_t gemm_batch_offset,
                                       size_t gemm_batch_size) const
@@ -188,8 +188,7 @@ void RNNForwardDataModularAlgo::PropHxCx(const Handle& handle,
     }
 }
 
-void RNNForwardDataModularAlgo::AddBias(const Handle& handle,
-                                        const runtimeArgsFwd& runtimeArgs) const
+void RNNForwardDataModularAlgo::AddBias(Handle& handle, const runtimeArgsFwd& runtimeArgs) const
 {
     if(rnnDesc.biasMode == miopenRNNNoBias)
         return;
diff --git a/src/rnn/Solutions/bwd_multi_stream.cpp b/src/rnn/Solutions/bwd_multi_stream.cpp
index 964c8d50fa..00f3dddc57 100644
--- a/src/rnn/Solutions/bwd_multi_stream.cpp
+++ b/src/rnn/Solutions/bwd_multi_stream.cpp
@@ -172,7 +172,7 @@ bool RNNModularMultiStreamBWD::ChunkDispatch(const runtimeArgsBwd& args,
                                              size_t chunk_layer_offset) const
 {
     constexpr auto seq_dir = rnn_base::SequenceDirection::Forward;
-    const Handle& handle = *args.handle;
+    Handle& handle = *args.handle;
 
     if(chunk_time_offset >= max_seq_len)
         return false;
diff --git a/src/rnn/Solutions/bww_multi_stream.cpp b/src/rnn/Solutions/bww_multi_stream.cpp
index 1f480afdea..f77ce128c3 100644
--- a/src/rnn/Solutions/bww_multi_stream.cpp
+++ b/src/rnn/Solutions/bww_multi_stream.cpp
@@ -38,7 +38,7 @@ void
RNNModularMultiStreamBWWeights::PrologueDispatch(const runtimeArgsBww& args rnnAlgoModules.PrepareWriteBuffers(*args.handle, args.dw); } -void RNNModularMultiStreamBWWeights::Compute(const Handle& handle, +void RNNModularMultiStreamBWWeights::Compute(Handle& handle, ConstData_t x, ConstData_t hx, Data_t dw, diff --git a/src/rnn/Solutions/bww_s_steam.cpp b/src/rnn/Solutions/bww_s_steam.cpp index 736d8cfde3..fa81015f12 100644 --- a/src/rnn/Solutions/bww_s_steam.cpp +++ b/src/rnn/Solutions/bww_s_steam.cpp @@ -29,7 +29,7 @@ namespace miopen { namespace rnn_base { -void RNNModularSingleStreamBWWeights::Compute(const Handle& handle, +void RNNModularSingleStreamBWWeights::Compute(Handle& handle, ConstData_t x, ConstData_t hx, Data_t dw, diff --git a/src/tensor.cpp b/src/tensor.cpp index c1bd709267..bc2efae3bf 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -26,12 +26,19 @@ #include #include +#include #include #include #include +#include +#include #include #include #include +#include +#include + +#include #include @@ -872,21 +879,1162 @@ void from_json(const nlohmann::json& j, TensorDescriptor& descriptor) j.at("type").get_to(descriptor.type); } -void OpTensor2(Handle& handle, - miopenTensorOp_t tensorOp, - const void* alpha0, - const TensorDescriptor& aTensorDesc, - ConstData_t ATensor, - const void* alpha1, - const TensorDescriptor& bTensorDesc, - ConstData_t BTensor, - const void* beta, - const TensorDescriptor& cTensorDesc, - Data_t CTensor, - const size_t Aoffset, - const size_t Boffset, - const size_t Coffset, - bool nonStandardSquash) +TensorDescriptor GetFlattenedTensorDescriptor(const TensorDescriptor& desc) +{ + // is packed + if(desc.IsPacked()) + return {desc.GetType(), {desc.GetElementSize()}, {static_cast(1)}}; + + // start flattening tensor + std::vector flat_lengths; + std::vector flat_strides; + + auto non1_length_strides = boost::combine(desc.GetLengths(), desc.GetStrides()) | + boost::adaptors::filtered(f_length_is_not_1_t()); + + auto i = non1_length_strides.begin(); + std::size_t flat_len = boost::get<0>(*i); + auto i_previous = i++; + + // the 0-th dimension full-length doesn't matter + for(; i != non1_length_strides.end(); ++i) + { + std::size_t len = boost::get<0>(*i); + std::size_t stride = boost::get<1>(*i); + std::size_t previous_stride = boost::get<1>(*i_previous); + std::size_t full_len = previous_stride / stride; + + if(len == full_len) + { + flat_len *= len; + } + else + { + flat_lengths.push_back(flat_len); + flat_strides.push_back(previous_stride); + flat_len = len; + } + i_previous = i; + } + flat_lengths.push_back(flat_len); + flat_strides.push_back(boost::get<1>(*i_previous)); + + return {desc.GetType(), flat_lengths, flat_strides}; +} + +struct two_exp_ceiling_t +{ + std::size_t operator()(std::size_t n) const + { + assert(n > 0); + + std::size_t i = 1; + + n--; + while(n != 0) + { + i *= 2; + n /= 2; + } + + return i; + } +}; + +static std::vector get_worker_sizes(const std::vector& data_sizes) +{ + const std::size_t dim = data_sizes.size(); + + std::vector worker_sizes(dim); + + std::transform(data_sizes.begin(), data_sizes.end(), worker_sizes.begin(), two_exp_ceiling_t{}); + + std::size_t wgd = std::accumulate( + worker_sizes.begin(), worker_sizes.end(), std::size_t{1}, std::multiplies()); + + if(wgd > 65536) + { + std::size_t n = wgd / 65536; + + int i = 0; + while(n > 1 && i < dim) + { + std::size_t size_old = worker_sizes[i]; + worker_sizes[i] = (size_old - 1) / n + 1; + n /= size_old / worker_sizes[i]; + ++i; + } + } + + return worker_sizes; +} + 
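For reference, the grid-sizing helper added above can be exercised in isolation. The sketch below is a minimal standalone distillation, not part of the patch: the functor is flattened into a free function, and the main() driver and sample shape are illustrative only. Each length is rounded up to the next power of two, and when the product of the rounded lengths exceeds 65536 the leading dimensions are ceil-divided down until the grid fits.

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <numeric>
    #include <vector>

    // Round n up to the next power of two (n must be > 0).
    static std::size_t two_exp_ceiling(std::size_t n)
    {
        assert(n > 0);
        std::size_t i = 1;
        n--;
        while(n != 0)
        {
            i *= 2;
            n /= 2;
        }
        return i;
    }

    // Mirror of the helper above: per-dimension power-of-two rounding, then a
    // shrink pass that keeps the total work-group product within 65536.
    static std::vector<std::size_t> get_worker_sizes(const std::vector<std::size_t>& data_sizes)
    {
        const std::size_t dim = data_sizes.size();
        std::vector<std::size_t> worker_sizes(dim);
        std::transform(data_sizes.begin(), data_sizes.end(), worker_sizes.begin(), two_exp_ceiling);

        std::size_t wgd = std::accumulate(
            worker_sizes.begin(), worker_sizes.end(), std::size_t{1}, std::multiplies<std::size_t>());

        if(wgd > 65536)
        {
            std::size_t n = wgd / 65536;
            std::size_t i = 0;
            while(n > 1 && i < dim)
            {
                std::size_t size_old = worker_sizes[i];
                worker_sizes[i]      = (size_old - 1) / n + 1; // ceil-divide dimension i by n
                n /= size_old / worker_sizes[i];               // account for the realized shrink
                ++i;
            }
        }
        return worker_sizes;
    }

    int main()
    {
        // A 300x300x3 tensor rounds to 512x512x4 (product 1048576), which the
        // shrink pass reduces to 32x512x4 (product exactly 65536).
        for(std::size_t len : get_worker_sizes({300, 300, 3}))
            std::cout << len << ' ';
        std::cout << '\n';
    }

The shrink loop divides by the realized ratio size_old / worker_sizes[i] rather than by n directly, so the running factor stays exact whenever the ceil-division lands on an even divisor, as it always does for power-of-two inputs.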
+void SetTensor(const Handle& handle, + const TensorDescriptor& yDesc, + Data_t y, + const void* alpha, + const int offset) +{ + if(y == nullptr || alpha == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm); + } + + const TensorDescriptor yDesc_flat = GetFlattenedTensorDescriptor(yDesc); + +#ifndef NDEBUG + if(yDesc.GetNumDims() != yDesc_flat.GetNumDims()) + { + MIOPEN_LOG_I2("real descriptor: " << yDesc); + MIOPEN_LOG_I2("flat descriptor: " << yDesc_flat); + } +#endif + + const std::size_t yDim_flat = yDesc_flat.GetNumDims(); + + assert(yDim_flat > 0 && yDim_flat <= 5); + + std::string kernel_name = "SubTensorOpWithScalar" + std::to_string(yDim_flat) + "d"; + + const miopenDataType_t dataType = yDesc_flat.GetType(); + + std::string network_config = "set " + std::to_string(dataType); + for(auto& len : yDesc_flat.GetLengths()) + { + network_config += " " + std::to_string(len); + } + + auto&& kernels = handle.GetKernels(kernel_name, network_config); + + KernelInvoke kernel; + + if(!kernels.empty()) + { + kernel = kernels.front(); + } + else + { + std::string program_name = "MIOpenSubTensorOpWithScalarKernel.cl"; + + std::vector worker_sizes = get_worker_sizes(yDesc_flat.GetLengths()); + + std::size_t wgd = std::accumulate(worker_sizes.begin(), + worker_sizes.end(), + std::size_t{1}, + std::multiplies()); + + std::size_t wld = 256 < wgd ? 256 : wgd; + std::stringstream ss; + ss << "-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET" + << GetDataTypeKernelParams(dataType); + for(int i = 0; i < yDim_flat; ++i) + { + ss << " -DWORK_LENGTH_" << std::to_string(i) << "=" << std::to_string(worker_sizes[i]); + } + + kernel = handle.AddKernel(kernel_name, + network_config, + program_name, + kernel_name, + {wld, 1, 1}, + {wgd, 1, 1}, + ss.str()); + } + + switch(yDim_flat) + { + case 1: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetLengths()[0])); + }); + + break; + } + case 2: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1])); + }); + + break; + } + case 3: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + static_cast(yDesc_flat.GetLengths()[2])); + }); + + break; + } + case 4: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetStrides()[3]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + static_cast(yDesc_flat.GetLengths()[2]), + static_cast(yDesc_flat.GetLengths()[3])); + }); + + break; + } + case 5: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetStrides()[3]), + static_cast(yDesc_flat.GetStrides()[4]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + 
static_cast(yDesc_flat.GetLengths()[2]), + static_cast(yDesc_flat.GetLengths()[3]), + static_cast(yDesc_flat.GetLengths()[4])); + }); + + break; + } + default: assert(false); + } +} + +void ScaleTensor(const Handle& handle, + const TensorDescriptor& yDesc, + Data_t y, + const void* alpha, + const int offset) +{ + if(y == nullptr || alpha == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm); + } + + const TensorDescriptor yDesc_flat = GetFlattenedTensorDescriptor(yDesc); + +#ifndef NDEBUG + if(yDesc.GetNumDims() != yDesc_flat.GetNumDims()) + { + MIOPEN_LOG_I2("real descriptor: " << yDesc); + MIOPEN_LOG_I2("flat descriptor: " << yDesc_flat); + } +#endif + + const std::size_t yDim_flat = yDesc_flat.GetNumDims(); + + assert(yDim_flat > 0 && yDim_flat <= 5); + + const miopenDataType_t dataType = yDesc_flat.GetType(); + + if(!(dataType == miopenHalf // + || dataType == miopenFloat // + || dataType == miopenInt32 // + || dataType == miopenDouble)) + { + MIOPEN_THROW(miopenStatusBadParm, "ScaleTensor: unsupported data type."); + } + + std::string kernel_name = "SubTensorOpWithScalar" + std::to_string(yDim_flat) + "d"; + + const std::vector& lens = yDesc_flat.GetLengths(); + + std::string network_config = "scale " + std::to_string(yDesc_flat.GetType()); + for(auto& len : lens) + { + network_config += " " + std::to_string(len); + } + + auto&& kernels = handle.GetKernels(kernel_name, network_config); + + KernelInvoke kernel; + + if(!kernels.empty()) + { + kernel = kernels.front(); + } + else + { + std::string program_name = "MIOpenSubTensorOpWithScalarKernel.cl"; + + std::vector worker_sizes = get_worker_sizes(lens); + + std::size_t wgd = std::accumulate(worker_sizes.begin(), + worker_sizes.end(), + std::size_t{1}, + std::multiplies()); + + std::size_t wld = 256 < wgd ? 
256 : wgd; + + std::string parms = "-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_MULTIPLY" + + GetDataTypeKernelParams(dataType); + for(int i = 0; i < yDim_flat; ++i) + { + parms += " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); + } + + kernel = handle.AddKernel(kernel_name, + network_config, + program_name, + kernel_name, + {wld, 1, 1}, + {wgd, 1, 1}, + parms); + } + + switch(yDim_flat) + { + case 1: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetLengths()[0])); + }); + + break; + } + case 2: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1])); + }); + + break; + } + case 3: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + static_cast(yDesc_flat.GetLengths()[2])); + }); + + break; + } + case 4: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetStrides()[3]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + static_cast(yDesc_flat.GetLengths()[2]), + static_cast(yDesc_flat.GetLengths()[3])); + }); + + break; + } + case 5: { + visit_float(dataType, [&](auto as_float) { + kernel(y, + *as_float(alpha), + offset, + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetStrides()[3]), + static_cast(yDesc_flat.GetStrides()[4]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + static_cast(yDesc_flat.GetLengths()[2]), + static_cast(yDesc_flat.GetLengths()[3]), + static_cast(yDesc_flat.GetLengths()[4])); + }); + + break; + } + default: assert(false); + } +} + +void CopyTensor(const Handle& handle, + const TensorDescriptor& srcDesc, + ConstData_t src, + const TensorDescriptor& dstDesc, + Data_t dst, + int srcOffset, + int dstOffset, + bool forseAsync) +{ + if(src == nullptr || dst == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "Null pointer for tensor."); + } + + if(srcDesc.GetType() != dstDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, "Tensor types do not match."); + } + + if(srcDesc.GetLengths() != dstDesc.GetLengths()) + { + MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match."); + } + + auto flat_descriptors = GetConsistentFlattenedTensorDescriptors(srcDesc, dstDesc); + const TensorDescriptor& srcDesc_flat = std::get<0>(flat_descriptors); + const TensorDescriptor& dstDesc_flat = std::get<1>(flat_descriptors); + +#ifndef NDEBUG + if(srcDesc.GetNumDims() != srcDesc_flat.GetNumDims()) + { + MIOPEN_LOG_I2("src real descriptor: " << srcDesc); + MIOPEN_LOG_I2("src flat descriptor: " << srcDesc_flat); + MIOPEN_LOG_I2("dst real descriptor: " << dstDesc); + MIOPEN_LOG_I2("dst flat descriptor: " << dstDesc_flat); + } +#endif + + std::size_t srcDim_flat = srcDesc_flat.GetNumDims(); + + if(srcDim_flat 
< 1 || srcDim_flat > 5) + { + MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension sizes unsupported."); + } + + if(forseAsync || srcOffset > 0 || dstOffset > 0 || + (!(srcDesc_flat.IsPacked() && dstDesc_flat.IsPacked()))) + { + std::string kernel_name = "SubTensorOpWithSubTensor" + std::to_string(srcDim_flat) + "d"; + + const std::vector& lens = srcDesc_flat.GetLengths(); + + std::string network_config = "copy " + std::to_string(srcDesc_flat.GetType()); + for(auto& len : lens) + { + network_config += " " + std::to_string(len); + } + + auto&& kernels = handle.GetKernels(kernel_name, network_config); + + KernelInvoke kernel; + + if(!kernels.empty()) + { + kernel = kernels.front(); + } + else + { + std::string program_name = "MIOpenSubTensorOpWithSubTensorKernel.cl"; + + std::vector worker_sizes = get_worker_sizes(lens); + + std::size_t wgd = std::accumulate(worker_sizes.begin(), + worker_sizes.end(), + std::size_t{1}, + std::multiplies()); + + std::size_t wld = 256 < wgd ? 256 : wgd; + + std::string parms = "-DSUBTENSOR_OP_WITH_SUBTENSOR=SUBTENSOR_OP_WITH_SUBTENSOR_COPY" + + GetDataTypeKernelParams(srcDesc_flat.GetType()); + for(std::size_t i = 0; i < srcDim_flat; ++i) + { + parms += + " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); + } + + kernel = handle.AddKernel(kernel_name, + network_config, + program_name, + kernel_name, + {wld, 1, 1}, + {wgd, 1, 1}, + parms); + } + + switch(srcDim_flat) + { + case 1: { + kernel(src, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetLengths()[0]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0])); + + break; + } + case 2: { + kernel(src, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1])); + + break; + } + case 3: { + kernel(src, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetStrides()[2]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + static_cast(srcDesc_flat.GetLengths()[2]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1]), + static_cast(dstDesc_flat.GetStrides()[2])); + + break; + } + case 4: { + kernel(src, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetStrides()[2]), + static_cast(srcDesc_flat.GetStrides()[3]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + static_cast(srcDesc_flat.GetLengths()[2]), + static_cast(srcDesc_flat.GetLengths()[3]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1]), + static_cast(dstDesc_flat.GetStrides()[2]), + static_cast(dstDesc_flat.GetStrides()[3])); + + break; + } + case 5: { + kernel(src, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetStrides()[2]), + static_cast(srcDesc_flat.GetStrides()[3]), + static_cast(srcDesc_flat.GetStrides()[4]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + static_cast(srcDesc_flat.GetLengths()[2]), + static_cast(srcDesc_flat.GetLengths()[3]), + 
static_cast(srcDesc_flat.GetLengths()[4]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1]), + static_cast(dstDesc_flat.GetStrides()[2]), + static_cast(dstDesc_flat.GetStrides()[3]), + static_cast(dstDesc_flat.GetStrides()[4])); + + break; + } + default: assert(false); + } + } + else + { + handle.Copy(src, dst, srcDesc_flat.GetElementSize() * GetTypeSize(srcDesc_flat.GetType())); + } +} + +std::string GetCastTensorBuildOptionFromType(const std::string& buildOption, miopenDataType_t type) +{ + std::string option(buildOption); + switch(type) + { + case miopenInt8: return option += "0"; + case miopenInt32: return option += "1"; + case miopenHalf: return option += "2"; + case miopenFloat: return option += "3"; + case miopenBFloat16: return option += "4"; + case miopenFloat8: + MIOPEN_THROW(miopenStatusBadParm, "miopenFloat8 data type not supported in cast tensor."); + case miopenBFloat8: + MIOPEN_THROW(miopenStatusBadParm, "miopenBFloat8 data type not supported in cast tensor."); + case miopenDouble: + // TODO + MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported in cast tensor."); + case miopenInt64: + MIOPEN_THROW(miopenStatusBadParm, "miopenInt64 data type not supported in cast tensor."); + default: MIOPEN_THROW(miopenStatusBadParm, "Invalid data type in cast tensor desc."); + } +} + +void CastTensor(const Handle& handle, + const void* alpha, + const bool clamping, + const TensorDescriptor& srcDesc, + ConstData_t src, + const TensorDescriptor& dstDesc, + Data_t dst, + int srcOffset, + int dstOffset) +{ + if(src == nullptr || dst == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "Null pointer for tensor."); + } + + if(srcDesc.GetLengths() != dstDesc.GetLengths()) + { + MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match."); + } + + auto flat_descriptors = GetConsistentFlattenedTensorDescriptors(srcDesc, dstDesc); + const TensorDescriptor& srcDesc_flat = std::get<0>(flat_descriptors); + const TensorDescriptor& dstDesc_flat = std::get<1>(flat_descriptors); + +#ifndef NDEBUG + if(srcDesc.GetNumDims() != srcDesc_flat.GetNumDims()) + { + MIOPEN_LOG_I2("src real descriptor: " << srcDesc); + MIOPEN_LOG_I2("src flat descriptor: " << srcDesc_flat); + MIOPEN_LOG_I2("dst real descriptor: " << dstDesc); + MIOPEN_LOG_I2("dst flat descriptor: " << dstDesc_flat); + } +#endif + + std::size_t srcDim_flat = srcDesc_flat.GetNumDims(); + + if(srcDim_flat < 1 || srcDim_flat > 5) + { + MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension sizes unsupported."); + } + + if(srcDesc.GetType() == dstDesc.GetType() && srcOffset == 0 && dstOffset == 0 && + srcDesc_flat.IsPacked() && dstDesc_flat.IsPacked()) + { + handle.Copy(src, dst, srcDesc_flat.GetElementSize() * GetTypeSize(srcDesc_flat.GetType())); + } + else + { + std::string kernel_name = "SubTensorOpWithCastTensor" + std::to_string(srcDim_flat) + "d"; + + const std::vector& lens = srcDesc_flat.GetLengths(); + + std::string network_config = "cast " + std::to_string(dstDesc_flat.GetType()); + for(auto& len : lens) + { + network_config += " " + std::to_string(len); + } + + auto&& kernels = handle.GetKernels(kernel_name, network_config); + KernelInvoke kernel; + + auto miopen_alpha = *(static_cast(alpha)); + + if(!kernels.empty()) + { + kernel = kernels.front(); + } + else + { + std::string program_name = "MIOpenSubTensorOpWithCastTensorKernel.cl"; + + std::vector worker_sizes = get_worker_sizes(lens); + + std::size_t wgd = std::accumulate(worker_sizes.begin(), + 
worker_sizes.end(), + std::size_t{1}, + std::multiplies()); + + std::size_t wld = 256 < wgd ? 256 : wgd; + + std::string parms = + GetCastTensorBuildOptionFromType(" -DMIOPEN_SRC_TYPE=", srcDesc_flat.GetType()) + + GetCastTensorBuildOptionFromType(" -DMIOPEN_DST_TYPE=", dstDesc_flat.GetType()); + + for(std::size_t i = 0; i < srcDim_flat; ++i) + { + parms += + " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); + } + + if(dstDesc_flat.GetType() == miopenBFloat16) + { + parms += " -DMIOPEN_USE_RNE_BFLOAT16=1"; + } + + kernel = handle.AddKernel(kernel_name, + network_config, + program_name, + kernel_name, + {wld, 1, 1}, + {wgd, 1, 1}, + parms); + } + + const int clamping_arg = clamping ? 1 : 0; + switch(srcDim_flat) + { + case 1: { + kernel(src, + miopen_alpha, + clamping_arg, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetLengths()[0]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0])); + + break; + } + case 2: { + kernel(src, + miopen_alpha, + clamping_arg, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1])); + + break; + } + case 3: { + kernel(src, + miopen_alpha, + clamping_arg, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetStrides()[2]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + static_cast(srcDesc_flat.GetLengths()[2]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1]), + static_cast(dstDesc_flat.GetStrides()[2])); + + break; + } + case 4: { + kernel(src, + miopen_alpha, + clamping_arg, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetStrides()[2]), + static_cast(srcDesc_flat.GetStrides()[3]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + static_cast(srcDesc_flat.GetLengths()[2]), + static_cast(srcDesc_flat.GetLengths()[3]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1]), + static_cast(dstDesc_flat.GetStrides()[2]), + static_cast(dstDesc_flat.GetStrides()[3])); + + break; + } + case 5: { + kernel(src, + miopen_alpha, + clamping_arg, + srcOffset, + static_cast(srcDesc_flat.GetStrides()[0]), + static_cast(srcDesc_flat.GetStrides()[1]), + static_cast(srcDesc_flat.GetStrides()[2]), + static_cast(srcDesc_flat.GetStrides()[3]), + static_cast(srcDesc_flat.GetStrides()[4]), + static_cast(srcDesc_flat.GetLengths()[0]), + static_cast(srcDesc_flat.GetLengths()[1]), + static_cast(srcDesc_flat.GetLengths()[2]), + static_cast(srcDesc_flat.GetLengths()[3]), + static_cast(srcDesc_flat.GetLengths()[4]), + dst, + dstOffset, + static_cast(dstDesc_flat.GetStrides()[0]), + static_cast(dstDesc_flat.GetStrides()[1]), + static_cast(dstDesc_flat.GetStrides()[2]), + static_cast(dstDesc_flat.GetStrides()[3]), + static_cast(dstDesc_flat.GetStrides()[4])); + + break; + } + default: assert(false); + } + } +} + +void TransformTensor(const Handle& handle, + const void* alpha, + const TensorDescriptor& xDesc, + ConstData_t x, + const void* beta, + const TensorDescriptor& yDesc, + Data_t y, + size_t Xoffset, + 
+void TransformTensor(const Handle& handle,
+                     const void* alpha,
+                     const TensorDescriptor& xDesc,
+                     ConstData_t x,
+                     const void* beta,
+                     const TensorDescriptor& yDesc,
+                     Data_t y,
+                     size_t Xoffset,
+                     size_t Yoffset)
+{
+    if(x == nullptr || y == nullptr)
+    {
+        MIOPEN_THROW(miopenStatusBadParm);
+    }
+
+    if(alpha == nullptr || beta == nullptr)
+    {
+        MIOPEN_THROW(miopenStatusBadParm);
+    }
+
+    auto x_len = xDesc.GetLengths();
+    auto y_len = yDesc.GetLengths();
+
+    if(x_len.size() != y_len.size())
+    {
+        MIOPEN_THROW("Tensor dimensions must be the same");
+    }
+
+    if(x_len[0] != y_len[0])
+    {
+        MIOPEN_THROW("Tensor x and y batch sizes do not match");
+    }
+
+    const auto is_alpha_one = float_equal(*(static_cast<const float*>(alpha)), 1);
+    const auto is_beta_zero = float_equal(*(static_cast<const float*>(beta)), 0);
+
+    if(xDesc.GetType() == miopenInt8 && yDesc.GetType() == miopenInt8 && x_len.size() >= 3)
+    {
+        if(x_len[1] <= y_len[1])
+        {
+            if(x_len[1] <= (y_len[1] - 4) || y_len[1] % 4 != 0)
+            {
+                MIOPEN_THROW("Invalid y channel size");
+            }
+
+            int8_t zero = 0;
+            SetTensor(handle, yDesc, y, &zero);
+        }
+        else if(x_len[1] % 4 != 0)
+        {
+            MIOPEN_THROW("Invalid x channel size");
+        }
+
+        size_t batch_n = x_len[0];
+
+        x_len[0] = 1;
+        y_len[0] = 1;
+
+        miopen::TensorDescriptor x_batch_desc, y_batch_desc;
+        x_batch_desc = miopen::TensorDescriptor(miopenInt8, x_len);
+        y_batch_desc = miopen::TensorDescriptor(miopenInt8, y_len);
+
+        size_t x_batch_sz = x_batch_desc.GetElementSize();
+        size_t y_batch_sz = y_batch_desc.GetElementSize();
+
+        for(size_t i = 0; i < batch_n; i++)
+        {
+            size_t x_offset = i * x_batch_sz;
+            size_t y_offset = i * y_batch_sz;
+
+            if(is_alpha_one && is_beta_zero)
+            {
+                CopyTensor(handle,
+                           ((x_len[1] <= y_len[1]) ? x_batch_desc : y_batch_desc),
+                           x,
+                           ((x_len[1] <= y_len[1]) ? x_batch_desc : y_batch_desc),
+                           y,
+                           x_offset,
+                           y_offset);
+            }
+            else
+            {
+                MIOPEN_THROW(miopenStatusNotImplemented,
+                             "y=alpha*x+beta*y is not supported for int8 yet");
+            }
+        }
+    }
+    else
+    {
+        auto x_y_len          = boost::combine(x_len, y_len);
+        bool same_spatial_len = std::all_of(x_y_len.begin(), x_y_len.end(), [](auto v) {
+            return boost::get<0>(v) == boost::get<1>(v);
+        });
+
+        if(!same_spatial_len)
+        {
+            MIOPEN_THROW("Tensor x and y spatial sizes do not match");
+        }
+
+        auto flat_descriptors = GetConsistentFlattenedTensorDescriptors(xDesc, yDesc);
+        const TensorDescriptor& xDesc_flat = std::get<0>(flat_descriptors);
+        const TensorDescriptor& yDesc_flat = std::get<1>(flat_descriptors);
+
+        if(xDesc.GetNumDims() != xDesc_flat.GetNumDims())
+        {
+            MIOPEN_LOG_I2("x real descriptor: " << xDesc);
+            MIOPEN_LOG_I2("x flat descriptor: " << xDesc_flat);
+        }
+
+        if(yDesc.GetNumDims() != yDesc_flat.GetNumDims())
+        {
+            MIOPEN_LOG_I2("y real descriptor: " << yDesc);
+            MIOPEN_LOG_I2("y flat descriptor: " << yDesc_flat);
+        }
+
+        const std::size_t yDim_flat = yDesc_flat.GetNumDims();
+
+        assert(yDim_flat > 0 && yDim_flat <= 5);
+
+        const miopenDataType_t dataTypex = xDesc_flat.GetType();
+        const miopenDataType_t dataTypey = yDesc_flat.GetType();
+
+        if(!(dataTypex == miopenHalf     //
+             || dataTypex == miopenFloat //
+             || dataTypex == miopenInt32 //
+             || dataTypex == miopenBFloat16 //
+             || dataTypex == miopenDouble))
+        {
+            MIOPEN_THROW("Tensor x is an unsupported data type");
+        }
+
+        if(!(dataTypey == miopenHalf     //
+             || dataTypey == miopenFloat //
+             || dataTypey == miopenInt32 //
+             || dataTypey == miopenBFloat16 //
+             || dataTypey == miopenDouble))
+        {
+            MIOPEN_THROW("Tensor y is an unsupported data type");
+        }
+
+        if(dataTypex != dataTypey)
+        {
+            MIOPEN_THROW("Tensor x and y have different data types");
+        }
+
+        std::string kernel_name = "SubTensorOpWithTransform" + std::to_string(yDim_flat) + "d";
+
+        const std::vector<std::size_t>& lens = yDesc_flat.GetLengths();
+
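+        // Kernels are cached per (kernel_name, network_config), so the config
+        // string encodes the data type, the flat lengths, and the alpha/beta
+        // fast-path flags that specialize the compiled kernel.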
std::string network_config = "transform " + std::to_string(yDesc_flat.GetType()); + for(auto& len : lens) + { + network_config += "x" + std::to_string(len); + } + + if(is_beta_zero) + network_config += "xBETA_IS_ZERO"; + if(is_alpha_one) + network_config += "xALPHA_IS_ONE"; + + auto&& kernels = handle.GetKernels(kernel_name, network_config); + + KernelInvoke kernel; + + if(!kernels.empty()) + { + kernel = kernels.front(); + } + else + { + std::string program_name = "MIOpenSubTensorOpWithTransformKernel.cl"; + + std::vector worker_sizes = get_worker_sizes(lens); + + std::size_t wgd = std::accumulate(worker_sizes.begin(), + worker_sizes.end(), + std::size_t{1}, + std::multiplies()); + + std::size_t wld = 256 < wgd ? 256 : wgd; + + std::string parms = + GetDataTypeKernelParams(dataTypey) // + + " -DMIOPEN_BETA_IS_ZERO=" + std::to_string(static_cast(is_beta_zero)) // + + " -DMIOPEN_ALPHA_IS_ONE=" + std::to_string(static_cast(is_alpha_one)); + + for(int i = 0; i < yDim_flat; ++i) + { + parms += + " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); + } + + kernel = handle.AddKernel(kernel_name, + network_config, + program_name, + kernel_name, + {wld, 1, 1}, + {wgd, 1, 1}, + parms); + } + + switch(yDim_flat) + { + case 1: { + visit_float(dataTypey, [&](auto as_float) { + kernel(x, + *as_float(alpha), + y, + *as_float(beta), + static_cast(Xoffset), + static_cast(Yoffset), + static_cast(xDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetLengths()[0])); + }); + + break; + } + case 2: { + visit_float(dataTypey, [&](auto as_float) { + kernel(x, + *as_float(alpha), + y, + *as_float(beta), + static_cast(Xoffset), + static_cast(Yoffset), + static_cast(xDesc_flat.GetStrides()[0]), + static_cast(xDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1])); + }); + + break; + } + case 3: { + visit_float(dataTypey, [&](auto as_float) { + kernel(x, + *as_float(alpha), + y, + *as_float(beta), + static_cast(Xoffset), + static_cast(Yoffset), + static_cast(xDesc_flat.GetStrides()[0]), + static_cast(xDesc_flat.GetStrides()[1]), + static_cast(xDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + static_cast(yDesc_flat.GetLengths()[2])); + }); + + break; + } + case 4: { + visit_float(dataTypey, [&](auto as_float) { + kernel(x, + *as_float(alpha), + y, + *as_float(beta), + static_cast(Xoffset), + static_cast(Yoffset), + static_cast(xDesc_flat.GetStrides()[0]), + static_cast(xDesc_flat.GetStrides()[1]), + static_cast(xDesc_flat.GetStrides()[2]), + static_cast(xDesc_flat.GetStrides()[3]), + static_cast(yDesc_flat.GetStrides()[0]), + static_cast(yDesc_flat.GetStrides()[1]), + static_cast(yDesc_flat.GetStrides()[2]), + static_cast(yDesc_flat.GetStrides()[3]), + static_cast(yDesc_flat.GetLengths()[0]), + static_cast(yDesc_flat.GetLengths()[1]), + static_cast(yDesc_flat.GetLengths()[2]), + static_cast(yDesc_flat.GetLengths()[3])); + }); + + break; + } + case 5: { + visit_float(dataTypey, [&](auto as_float) { + kernel(x, + *as_float(alpha), + y, + *as_float(beta), + static_cast(Xoffset), + static_cast(Yoffset), + static_cast(xDesc_flat.GetStrides()[0]), + static_cast(xDesc_flat.GetStrides()[1]), + 
+        switch(yDim_flat)
+        {
+        case 1: {
+            visit_float(dataTypey, [&](auto as_float) {
+                kernel(x,
+                       *as_float(alpha),
+                       y,
+                       *as_float(beta),
+                       static_cast<int>(Xoffset),
+                       static_cast<int>(Yoffset),
+                       static_cast<int>(xDesc_flat.GetStrides()[0]),
+                       static_cast<int>(yDesc_flat.GetStrides()[0]),
+                       static_cast<int>(yDesc_flat.GetLengths()[0]));
+            });
+
+            break;
+        }
+        case 2: {
+            visit_float(dataTypey, [&](auto as_float) {
+                kernel(x,
+                       *as_float(alpha),
+                       y,
+                       *as_float(beta),
+                       static_cast<int>(Xoffset),
+                       static_cast<int>(Yoffset),
+                       static_cast<int>(xDesc_flat.GetStrides()[0]),
+                       static_cast<int>(xDesc_flat.GetStrides()[1]),
+                       static_cast<int>(yDesc_flat.GetStrides()[0]),
+                       static_cast<int>(yDesc_flat.GetStrides()[1]),
+                       static_cast<int>(yDesc_flat.GetLengths()[0]),
+                       static_cast<int>(yDesc_flat.GetLengths()[1]));
+            });
+
+            break;
+        }
+        case 3: {
+            visit_float(dataTypey, [&](auto as_float) {
+                kernel(x,
+                       *as_float(alpha),
+                       y,
+                       *as_float(beta),
+                       static_cast<int>(Xoffset),
+                       static_cast<int>(Yoffset),
+                       static_cast<int>(xDesc_flat.GetStrides()[0]),
+                       static_cast<int>(xDesc_flat.GetStrides()[1]),
+                       static_cast<int>(xDesc_flat.GetStrides()[2]),
+                       static_cast<int>(yDesc_flat.GetStrides()[0]),
+                       static_cast<int>(yDesc_flat.GetStrides()[1]),
+                       static_cast<int>(yDesc_flat.GetStrides()[2]),
+                       static_cast<int>(yDesc_flat.GetLengths()[0]),
+                       static_cast<int>(yDesc_flat.GetLengths()[1]),
+                       static_cast<int>(yDesc_flat.GetLengths()[2]));
+            });
+
+            break;
+        }
+        case 4: {
+            visit_float(dataTypey, [&](auto as_float) {
+                kernel(x,
+                       *as_float(alpha),
+                       y,
+                       *as_float(beta),
+                       static_cast<int>(Xoffset),
+                       static_cast<int>(Yoffset),
+                       static_cast<int>(xDesc_flat.GetStrides()[0]),
+                       static_cast<int>(xDesc_flat.GetStrides()[1]),
+                       static_cast<int>(xDesc_flat.GetStrides()[2]),
+                       static_cast<int>(xDesc_flat.GetStrides()[3]),
+                       static_cast<int>(yDesc_flat.GetStrides()[0]),
+                       static_cast<int>(yDesc_flat.GetStrides()[1]),
+                       static_cast<int>(yDesc_flat.GetStrides()[2]),
+                       static_cast<int>(yDesc_flat.GetStrides()[3]),
+                       static_cast<int>(yDesc_flat.GetLengths()[0]),
+                       static_cast<int>(yDesc_flat.GetLengths()[1]),
+                       static_cast<int>(yDesc_flat.GetLengths()[2]),
+                       static_cast<int>(yDesc_flat.GetLengths()[3]));
+            });
+
+            break;
+        }
+        case 5: {
+            visit_float(dataTypey, [&](auto as_float) {
+                kernel(x,
+                       *as_float(alpha),
+                       y,
+                       *as_float(beta),
+                       static_cast<int>(Xoffset),
+                       static_cast<int>(Yoffset),
+                       static_cast<int>(xDesc_flat.GetStrides()[0]),
+                       static_cast<int>(xDesc_flat.GetStrides()[1]),
+                       static_cast<int>(xDesc_flat.GetStrides()[2]),
+                       static_cast<int>(xDesc_flat.GetStrides()[3]),
+                       static_cast<int>(xDesc_flat.GetStrides()[4]),
+                       static_cast<int>(yDesc_flat.GetStrides()[0]),
+                       static_cast<int>(yDesc_flat.GetStrides()[1]),
+                       static_cast<int>(yDesc_flat.GetStrides()[2]),
+                       static_cast<int>(yDesc_flat.GetStrides()[3]),
+                       static_cast<int>(yDesc_flat.GetStrides()[4]),
+                       static_cast<int>(yDesc_flat.GetLengths()[0]),
+                       static_cast<int>(yDesc_flat.GetLengths()[1]),
+                       static_cast<int>(yDesc_flat.GetLengths()[2]),
+                       static_cast<int>(yDesc_flat.GetLengths()[3]),
+                       static_cast<int>(yDesc_flat.GetLengths()[4]));
+            });
+
+            break;
+        }
+        default: assert(false);
+        }
+    }
+}
+
+void OpTensor(Handle& handle,
+              miopenTensorOp_t tensorOp,
+              const void* alpha0,
+              const TensorDescriptor& aTensorDesc,
+              ConstData_t ATensor,
+              const void* alpha1,
+              const TensorDescriptor& bTensorDesc,
+              ConstData_t BTensor,
+              const void* beta,
+              const TensorDescriptor& cTensorDesc,
+              Data_t CTensor,
+              const size_t Aoffset,
+              const size_t Boffset,
+              const size_t Coffset,
+              bool nonStandardSquash)
 {
     if(ATensor == nullptr || BTensor == nullptr || CTensor == nullptr)
     {
diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp
index 1df83044b2..3121715e8a 100644
--- a/test/tensor_ops.cpp
+++ b/test/tensor_ops.cpp
@@ -181,24 +181,24 @@ struct verify_tensor_ops
         auto a_dev = handle.Write(a.data);
         auto b_dev = handle.Write(b.data);

-        miopen::OpTensor2(handle,
-                          // miopenTensorOpAdd,
-                          // miopenTensorOpMax,
-                          // miopenTensorOpMin,
-                          miopenTensorOpMul,
-                          &alpha0,
-                          a.desc,
-                          a_dev.get(),
-                          &alpha1,
-                          b.desc,
-                          b_dev.get(),
-                          &beta,
-                          c.desc,
-                          c_dev.get(),
-                          Aoffset,
-                          Boffset,
-                          Coffset,
-                          false); // it does not verify non-standard behaviour
+        miopen::OpTensor(handle,
+                         // miopenTensorOpAdd,
+                         // miopenTensorOpMax,
+                         // miopenTensorOpMin,
+                         miopenTensorOpMul,
+                         &alpha0,
+                         a.desc,
+                         a_dev.get(),
+                         &alpha1,
+                         b.desc,
+                         b_dev.get(),
+                         &beta,
+                         c.desc,
+                         c_dev.get(),
+                         Aoffset,
+                         Boffset,
+                         Coffset,
+                         false); // it does not verify non-standard behaviour

         if(not no_validate)
         {

From 155b35fae50ba2f816203db49ae6cb3d3866483c Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Wed, 20 Nov 2024 21:04:09 +0200
Subject: [PATCH 22/25] code tidying

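std::clamp already bounds num_wg to [1, max_num_wg], so the explicit
`num_wg = num_wg > max_num_wg ? max_num_wg : num_wg;` re-check afterwards
was dead code; drop it in the 1d and 2d solvers. Op3dTensorGeneric now
derives num_wg with the same std::clamp pattern (and a 32-thread work-group)
instead of going through GetBitmapAndWgInfo.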
---
 src/solver/tensorOp/Op1dTensorGeneric.cpp | 1 -
 src/solver/tensorOp/Op2dTensorGeneric.cpp | 1 -
 src/solver/tensorOp/Op3dTensorGeneric.cpp | 9 ++++-----
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp
index 896d75d50c..341e074d89 100644
--- a/src/solver/tensorOp/Op1dTensorGeneric.cpp
+++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp
@@ -89,7 +89,6 @@ Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context,
     size_t max_num_wg = 4096;

     auto num_wg = std::clamp(c_n / local_threads, size_t(1), size_t(max_num_wg));
-    num_wg      = num_wg > max_num_wg ? max_num_wg : num_wg;

     size_t global_threads = num_wg * local_threads;
     const std::array<size_t, 3> vld{local_threads, 1, 1};
diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp
index 41fca78068..35c9629ba7 100644
--- a/src/solver/tensorOp/Op2dTensorGeneric.cpp
+++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp
@@ -93,7 +93,6 @@ Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context,
     size_t max_num_wg = 4096;

     auto num_wg =
         std::clamp((clens[0] * clens[1]) / local_threads, size_t(1), size_t(max_num_wg));
-    num_wg = num_wg > max_num_wg ? max_num_wg : num_wg;

     size_t global_threads = num_wg * local_threads;
     const std::array<size_t, 3> vld{local_threads, 1, 1};
diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp
index 2bafc6abaa..782eb1804f 100644
--- a/src/solver/tensorOp/Op3dTensorGeneric.cpp
+++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp
@@ -86,12 +86,11 @@ Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context,

     miopenDataType_t data_type = bTensorDesc.GetType();

-    auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens);
+    size_t local_threads = 32;
+    size_t max_num_wg    = 4096;

-    int max_num_wg = 4096;
-    num_wg         = num_wg > max_num_wg ? max_num_wg : num_wg;
-
-    size_t local_threads = 256;
+    auto num_wg =
+        std::clamp((clens[0] * clens[1] * clens[2]) / local_threads, size_t(1), size_t(max_num_wg));

     size_t global_threads = num_wg * local_threads;
     const std::array<size_t, 3> vld{local_threads, 1, 1};

From 0b3454c678ee60ad7605d324c3979af8310193b2 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Mon, 25 Nov 2024 17:00:24 +0200
Subject: [PATCH 23/25] unit test for tensorOp PD + additional changes requested

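Adds a parameterized gtest that constructs tensorOp::ProblemDescription
directly: the well-formed case must succeed and report the expected beta,
while broadcast-length, rank, data-type, and nonStandardSquash violations
must make the constructor throw. Also removes the unused data_type/READ_TYPE
locals from Get4dParams and restores alphabetical order in the solver source
list.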
---
 src/CMakeLists.txt                            |   6 +-
 src/solver/tensorOp/tensor_op_helpers.hpp     |   5 -
 .../unit_tensorOp_ProblemDescription.cpp      | 200 ++++++++++++++++++
 3 files changed, 203 insertions(+), 8 deletions(-)
 create mode 100644 test/gtest/unit_tensorOp_ProblemDescription.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 0721efc4f3..c9f1ab511a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -342,11 +342,11 @@ set( MIOpen_Source
     solver/tensorOp/Op2dTensorLite.cpp
     solver/tensorOp/Op2dTensorSquash.cpp
     solver/tensorOp/Op3dTensorGeneric.cpp
-    solver/tensorOp/OpTensorFwdBias.cpp
-    solver/tensorOp/Op4dTensorLite.cpp
-    solver/tensorOp/OpTensorLeadingOnes.cpp
     solver/tensorOp/Op4dTensorGeneric.cpp
+    solver/tensorOp/Op4dTensorLite.cpp
     solver/tensorOp/Op5dTensorGeneric.cpp
+    solver/tensorOp/OpTensorFwdBias.cpp
+    solver/tensorOp/OpTensorLeadingOnes.cpp
     subbuffers.cpp
     t5layernorm_api.cpp
     target_properties.cpp
diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp
index 26a9ac42d0..cf46c6efe8 100644
--- a/src/solver/tensorOp/tensor_op_helpers.hpp
+++ b/src/solver/tensorOp/tensor_op_helpers.hpp
@@ -193,13 +193,8 @@ Get4dParams(const miopen::tensorOp::ProblemDescription& problem, bool is4dLite)

     if(is4dLite)
     {
-        // for naive tensor ops
-        const std::string data_type = GetDataType(bTensorDesc.GetType());
-
         size_t TENS_LEN = cTensorDesc.GetElementSize();
         size_t RD_BLCK  = (TENS_LEN % 4 == 0) ? 4 : (TENS_LEN % 2 == 0) ? 2 : 1;
-        const std::string READ_TYPE =
-            (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK);

         size_t total_work = std::max(TENS_LEN / RD_BLCK, size_t(1));
         size_t grp_sz     = (total_work + local_threads - 1) / local_threads;
diff --git a/test/gtest/unit_tensorOp_ProblemDescription.cpp b/test/gtest/unit_tensorOp_ProblemDescription.cpp
new file mode 100644
index 0000000000..1b02382881
--- /dev/null
+++ b/test/gtest/unit_tensorOp_ProblemDescription.cpp
@@ -0,0 +1,200 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/miopen.h>
+#include <miopen/tensorOp/problem_description.hpp>
+
+#include "unit_TensorDescriptor.hpp"
+#include <gtest/gtest.h>
+
+namespace {
+
+struct TensorOpProblemDescriptionTestCase
+{
+    miopenTensorOp_t tensorOp;
+    float beta;
+    miopen::unit_tests::TensorDescriptorParams aTensorDesc;
+    miopen::unit_tests::TensorDescriptorParams bTensorDesc;
+    miopen::unit_tests::TensorDescriptorParams cTensorDesc;
+    bool nonStandardSquash;
+    bool isOk;
+
+    friend std::ostream& operator<<(std::ostream& os, const TensorOpProblemDescriptionTestCase& tc)
+    {
+        std::string op;
+        switch(tc.tensorOp)
+        {
+        case miopenTensorOpAdd: op.append("miopenTensorOpAdd"); break;
+        case miopenTensorOpMul: op.append("miopenTensorOpMul"); break;
+        case miopenTensorOpMin: op.append("miopenTensorOpMin"); break;
+        case miopenTensorOpMax: op.append("miopenTensorOpMax"); break;
+
+        default: break;
+        }
+
+        os << "(" << tc.aTensorDesc << "), ";
+        os << "(" << tc.bTensorDesc << "), ";
+        os << "(" << tc.cTensorDesc << "), \n";
+        os << "(" << op << ") - beta ";
+        os << std::to_string(tc.beta) << ")\n";
+        return os;
+    }
+};
+
+class TestTensorOpPD : public ::testing::TestWithParam<TensorOpProblemDescriptionTestCase>
+{
+public:
+    static auto GetTestCases()
+    {
+        using TestCase = TensorOpProblemDescriptionTestCase;
+
+        return std::vector<TestCase>{
+            // clang-format off
+            // 4D
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                0.0f,                       // beta
+                {miopenHalf, {1, 4, 4, 4}}, // A
+                {miopenHalf, {1, 4, 4, 4}}, // B
+                {miopenHalf, {1, 4, 4, 4}}, // C
+                false,                      // nonStandardSquash
+                true                        // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                0.0f,                       // beta
+                {miopenHalf, {4, 4, 4}},    // A
+                {miopenHalf, {1, 1, 4}},    // B
+                {miopenHalf, {4, 4, 4}},    // C
+                false,                      // nonStandardSquash
+                false                       // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                1.0f,                       // beta
+                {miopenHalf, {4, 1, 4}},    // A
+                {miopenHalf, {1, 1, 4}},    // B
+                {miopenHalf, {4, 4, 4}},    // C
+                false,                      // nonStandardSquash
+                false                       // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                1.0f,                       // beta
+                {miopenHalf, {4, 4, 4}},    // A
+                {miopenHalf, {1, 1, 4}},    // B
+                {miopenFloat, {4, 4, 4}},   // C
+                false,                      // nonStandardSquash
+                false                       // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,               // tensorOp
+                1.0f,                            // beta
+                {miopenHalf, {4, 4, 4, 4, 4, 4}},// A
+                {miopenHalf, {1, 1, 4}},         // B
+                {miopenHalf, {4, 4, 4, 4, 4, 4}},// C
+                false,                           // nonStandardSquash
+                false                            // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                1.0f,                       // beta
+                {miopenHalf, {4, 4, 4}},    // A
+                {miopenHalf, {1, 4}},       // B
+                {miopenHalf, {4, 4, 4}},    // C
+                false,                      // nonStandardSquash
+                false                       // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                1.0f,                       // beta
+                {miopenHalf, {4, 4, 4}},    // A
+                {miopenHalf, {1, 1, 5}},    // B
+                {miopenHalf, {4, 4, 4}},    // C
+                false,                      // nonStandardSquash
+                false                       // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                1.0f,                       // beta
+                {miopenHalf, {4, 4, 4, 4}}, // A
+                {miopenHalf, {1, 1, 4, 4}}, // B
+                {miopenHalf, {4, 4, 4, 4}}, // C
+                true,                       // nonStandardSquash
+                false                       // isOk
+            },
+            TestCase{
+                miopenTensorOpAdd,          // tensorOp
+                1.0f,                       // beta
+                {miopenHalf, {1, 4, 2}},    // A
+                {miopenHalf, {1, 1, 4}},    // B
+                {miopenHalf, {1, 4, 2}},    // C
+                true,                       // nonStandardSquash
+                false                       // isOk
+            }
+            // clang-format on
+        };
+    }
+
+    void RunTest()
+    {
+        const auto p = GetParam();
+
+        if(p.isOk)
+        {
+            const auto pd =
+                miopen::tensorOp::ProblemDescription{p.tensorOp,
+                                                     static_cast<const void*>(&p.beta),
+                                                     p.aTensorDesc.GetTensorDescriptor(),
+                                                     p.bTensorDesc.GetTensorDescriptor(),
+                                                     p.cTensorDesc.GetTensorDescriptor(),
+                                                     p.nonStandardSquash};
+            ASSERT_EQ(pd.GetBeta(), p.beta);
+        }
+        else
+        {
+            ASSERT_ANY_THROW({
+                const auto pd = miopen::tensorOp::ProblemDescription(
+                    p.tensorOp,
+                    miopen::float_equal(p.beta, 0.0) ? nullptr
+                                                     : static_cast<const void*>(&p.beta),
+                    p.aTensorDesc.GetTensorDescriptor(),
+                    p.bTensorDesc.GetTensorDescriptor(),
+                    p.cTensorDesc.GetTensorDescriptor(),
+                    p.nonStandardSquash);
+            });
+        }
+    }
+};
+
+} // namespace
+
+using CPU_TensorOpProblemDescription_NONE = TestTensorOpPD;
+
+TEST_P(CPU_TensorOpProblemDescription_NONE, TensorOpProblemDescription) { this->RunTest(); };
+
+INSTANTIATE_TEST_SUITE_P(Full,
+                         CPU_TensorOpProblemDescription_NONE,
+                         testing::ValuesIn(TestTensorOpPD::GetTestCases()));

From 146070a40f2cd826e070a037a0f4139c44d79044 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Fri, 29 Nov 2024 10:37:27 +0200
Subject: [PATCH 24/25] fix windows build issue

---
 src/include/miopen/tensorOp/problem_description.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/include/miopen/tensorOp/problem_description.hpp b/src/include/miopen/tensorOp/problem_description.hpp
index dc60a3c7c9..ecbf189b3f 100644
--- a/src/include/miopen/tensorOp/problem_description.hpp
+++ b/src/include/miopen/tensorOp/problem_description.hpp
@@ -35,7 +35,7 @@ struct NetworkConfig;

 namespace tensorOp {

-struct ProblemDescription : ProblemDescriptionBase
+struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
 {
     ProblemDescription(const miopenTensorOp_t tensorOp_,
                        const void* beta_,

From 3dc0f66d3b7bc1ed4e2cd57af05c08f2bec8d676 Mon Sep 17 00:00:00 2001
From: novakovicdj
Date: Thu, 5 Dec 2024 17:25:04 +0200
Subject: [PATCH 25/25] kept changes in CastTensor but in tensor.cpp file

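The packed-copy fast path in CastTensor now also requires alpha == 1.0,
since a non-unit alpha has to be applied by the cast kernel rather than a
plain copy, and the cached kernel's network config now encodes the source
type as well as the destination type so kernels are not reused across
differing source types.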
---
 src/tensor.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/tensor.cpp b/src/tensor.cpp
index 9215a40665..000a5ba87e 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -1538,8 +1538,10 @@ void CastTensor(const Handle& handle,
         MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension sizes unsupported.");
     }

+    auto miopen_alpha = *(static_cast<const float*>(alpha));
+
     if(srcDesc.GetType() == dstDesc.GetType() && srcOffset == 0 && dstOffset == 0 &&
-       srcDesc_flat.IsPacked() && dstDesc_flat.IsPacked())
+       srcDesc_flat.IsPacked() && dstDesc_flat.IsPacked() && float_equal(miopen_alpha, 1.0))
     {
         handle.Copy(src, dst, srcDesc_flat.GetElementSize() * GetTypeSize(srcDesc_flat.GetType()));
     }
@@ -1549,7 +1551,9 @@ void CastTensor(const Handle& handle,

         const std::vector<std::size_t>& lens = srcDesc_flat.GetLengths();

-        std::string network_config = "cast " + std::to_string(dstDesc_flat.GetType());
+        // TODO: make proper network config
+        std::string network_config = "cast " + std::to_string(srcDesc_flat.GetType()) +
+                                     std::to_string(dstDesc_flat.GetType());
         for(auto& len : lens)
         {
             network_config += " " + std::to_string(len);
@@ -1558,8 +1562,6 @@ void CastTensor(const Handle& handle,
         auto&& kernels = handle.GetKernels(kernel_name, network_config);
         KernelInvoke kernel;

-        auto miopen_alpha = *(static_cast<const float*>(alpha));
-
         if(!kernels.empty())
        {
             kernel = kernels.front();