From 62a05348f511ebd8378ef00596aa2d3d7da80179 Mon Sep 17 00:00:00 2001 From: amberhassaan Date: Sat, 16 Dec 2023 14:34:46 -0500 Subject: [PATCH] Standardize workspace abstraction (#2524) --- src/ocl/convolutionocl.cpp | 21 +++ test/conv_common.hpp | 220 +++++++++--------------- test/ctc.cpp | 8 +- test/find_2_conv.cpp | 23 +-- test/find_db.cpp | 28 ++- test/gru_common.hpp | 104 +++++------ test/gtest/conv_api_strided_tensors.cpp | 22 +-- test/gtest/solver_bwd.hpp | 14 +- test/gtest/solver_fwd.hpp | 14 +- test/gtest/solver_wrw.hpp | 14 +- test/lstm_common.hpp | 173 +++++++++++-------- test/main.cpp | 114 +++--------- test/pooling_common.hpp | 21 +-- test/reduce_test.cpp | 77 ++++----- test/rnn_vanilla_common.hpp | 69 ++++---- test/tensor_reorder.cpp | 91 +++------- test/workspace.hpp | 163 ++++++++++++++++++ 17 files changed, 595 insertions(+), 581 deletions(-) create mode 100644 test/workspace.hpp diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index 73e17380ae..94251cc6d0 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -131,6 +131,18 @@ static inline void ValidateGroupCount(const TensorDescriptor& x, MIOPEN_THROW(miopenStatusBadParm, "Invalid group number"); } +static inline void ValidateWorkspace(Data_t workSpace, const size_t workSpaceSize) +{ + + [[maybe_unused]] bool x = (workSpace != nullptr); + [[maybe_unused]] bool y = (workSpaceSize != 0); + + assert(((x && y) || (!x && !y)) && "workspace pointer and size don't match. Either both should " + "be zero or both should be non-zero"); + + /// \todo could add a check here that workSpace points to GPU memory +} + static Invoker PrepareInvoker(ExecutionContext ctx, const conv::ProblemDescription& problem, const NetworkConfig& config, @@ -260,6 +272,7 @@ void ConvolutionDescriptor::FindConvFwdAlgorithm(Handle& handle, bool exhaustiveSearch) const { MIOPEN_LOG_I("requestAlgoCount = " << requestAlgoCount << ", workspace = " << workSpaceSize); + ValidateWorkspace(workSpace, workSpaceSize); if(x == nullptr || w == nullptr || y == nullptr) MIOPEN_THROW(miopenStatusBadParm, "Buffers cannot be NULL"); if(returnedAlgoCount == nullptr) @@ -495,6 +508,7 @@ void ConvolutionDescriptor::ConvolutionForward(Handle& handle, size_t workSpaceSize) const { MIOPEN_LOG_I("algo = " << algo << ", workspace = " << workSpaceSize); + ValidateWorkspace(workSpace, workSpaceSize); const auto tensors = ConvFwdTensors{xDesc, x, wDesc, w, yDesc, y}; ValidateTensors(tensors); @@ -812,6 +826,7 @@ void ConvolutionDescriptor::ConvolutionForwardImmediate(Handle& handle, const solver::Id solver_id) const { MIOPEN_LOG_I("solver_id = " << solver_id.ToString() << ", workspace = " << workSpaceSize); + ValidateWorkspace(workSpace, workSpaceSize); const auto tensors = ConvFwdTensors{xDesc, x, wDesc, w, yDesc, y}; ValidateTensors(tensors); @@ -846,6 +861,7 @@ void ConvolutionDescriptor::FindConvBwdDataAlgorithm(Handle& handle, bool exhaustiveSearch) const { MIOPEN_LOG_I("requestAlgoCount = " << requestAlgoCount << ", workspace = " << workSpaceSize); + ValidateWorkspace(workSpace, workSpaceSize); if(dx == nullptr || w == nullptr || dy == nullptr) MIOPEN_THROW(miopenStatusBadParm, "Buffers cannot be NULL"); if(returnedAlgoCount == nullptr) @@ -944,6 +960,7 @@ void ConvolutionDescriptor::ConvolutionBackwardData(Handle& handle, size_t workSpaceSize) const { MIOPEN_LOG_I("algo = " << algo << ", workspace = " << workSpaceSize); + ValidateWorkspace(workSpace, workSpaceSize); auto tensors = ConvBwdTensors{dyDesc, dy, wDesc, w, dxDesc, dx}; @@ -1015,6 +1032,7 @@ void ConvolutionDescriptor::ConvolutionBackwardImmediate(Handle& handle, solver::Id solver_id) const { MIOPEN_LOG_I("solver_id = " << solver_id.ToString() << ", workspace = " << workSpaceSize); + ValidateWorkspace(workSpace, workSpaceSize); auto tensors = ConvBwdTensors{dyDesc, dy, wDesc, w, dxDesc, dx}; ValidateTensors(tensors); @@ -1055,6 +1073,7 @@ void ConvolutionDescriptor::FindConvBwdWeightsAlgorithm(Handle& handle, bool exhaustiveSearch) const { MIOPEN_LOG_I("requestAlgoCount = " << requestAlgoCount << ", workspace = " << workSpaceSize); + ValidateWorkspace(workSpace, workSpaceSize); if(x == nullptr || dw == nullptr || dy == nullptr) MIOPEN_THROW(miopenStatusBadParm, "Buffers cannot be NULL"); if(returnedAlgoCount == nullptr) @@ -1151,6 +1170,7 @@ void ConvolutionDescriptor::ConvolutionBackwardWeights(const Handle& handle, size_t workSpaceSize) const { MIOPEN_LOG_I("algo = " << algo << ", workspace = " << workSpaceSize); + ValidateWorkspace(workSpace, workSpaceSize); decltype(auto) tensors = ConvWrwTensors{dyDesc, dy, xDesc, x, dwDesc, dw}; ValidateTensors(tensors); ValidateAlphaBeta(alpha, beta); @@ -1218,6 +1238,7 @@ void ConvolutionDescriptor::ConvolutionWrwImmediate(Handle& handle, solver::Id solver_id) const { MIOPEN_LOG_I("solver_id = " << solver_id.ToString() << ", workspace = " << workSpaceSize); + ValidateWorkspace(workSpace, workSpaceSize); auto tensors = ConvWrwTensors{dyDesc, dy, xDesc, x, dwDesc, dw}; ValidateTensors(tensors); diff --git a/test/conv_common.hpp b/test/conv_common.hpp index efaf4d8f0a..e387e98b5b 100644 --- a/test/conv_common.hpp +++ b/test/conv_common.hpp @@ -25,6 +25,7 @@ *******************************************************************************/ #pragma once #include "test.hpp" +#include "workspace.hpp" #include #include #include @@ -330,13 +331,11 @@ struct conv_base EXPECT_EQUAL(miopenStatusSuccess, miopenGetSolutionWorkspaceSize(solution, &workspace_size)); - const auto workspace_dev = workspace_size != 0 - ? get_handle().Write(std::vector(workspace_size)) - : nullptr; + Workspace wspace{workspace_size}; - EXPECT_EQUAL(miopenStatusSuccess, - miopenRunSolution( - handle, solution, 3, arguments, workspace_dev.get(), workspace_size)); + EXPECT_EQUAL( + miopenStatusSuccess, + miopenRunSolution(handle, solution, 3, arguments, wspace.ptr(), wspace.size())); } const auto& solution_deref = miopen::deref(solutions.front()); @@ -596,19 +595,6 @@ struct verify_forward_conv : conv_base return rout; } - void resize_workspace(miopen::Handle& h, - const std::size_t sz, - std::vector& ws, - miopen::Allocator::ManageDataPtr& ws_dev) const - { - ws_dev.reset(); - if(sz > 0) - { - ws.resize(sz); - ws_dev = h.Write(ws); - } - } - tensor gpu() { auto&& handle = get_handle(); @@ -629,8 +615,7 @@ struct verify_forward_conv : conv_base bool fallback_path_taken = false; std::size_t count = 0; - std::vector ws; - miopen::Allocator::ManageDataPtr ws_dev = nullptr; + Workspace wspace{}; const auto ctx = ExecutionContext{&handle}; const auto problem = ConvProblemDescription{ @@ -649,8 +634,7 @@ struct verify_forward_conv : conv_base { int ret_algo_count; miopenConvAlgoPerf_t perf; - const auto workspace_size = filter.GetWorkSpaceSize(ctx, problem); - resize_workspace(handle, workspace_size, ws, ws_dev); + wspace.resize(filter.GetWorkSpaceSize(ctx, problem)); filter.FindConvBwdDataAlgorithm(handle, input.desc, @@ -662,8 +646,8 @@ struct verify_forward_conv : conv_base 1, &ret_algo_count, &perf, - ws_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), search); } count = filter.GetSolutionCount(ctx, problem); @@ -696,7 +680,7 @@ struct verify_forward_conv : conv_base << " != " << ws_size << std::endl; } } - resize_workspace(handle, selected.workspace_size, ws, ws_dev); + wspace.resize(selected.workspace_size); filter.CompileSolution(ctx, problem, selected.solution_id); @@ -707,8 +691,8 @@ struct verify_forward_conv : conv_base wei_dev.get(), rout.desc, out_dev.get(), - ws_dev.get(), - selected.workspace_size, + wspace.ptr(), + wspace.size(), selected.solution_id); } else @@ -717,8 +701,7 @@ struct verify_forward_conv : conv_base { int ret_algo_count; miopenConvAlgoPerf_t perf; - const auto workspace_size = filter.GetWorkSpaceSize(ctx, problem); - resize_workspace(handle, workspace_size, ws, ws_dev); + wspace.resize(filter.GetWorkSpaceSize(ctx, problem)); filter.FindConvFwdAlgorithm(handle, input.desc, @@ -730,8 +713,8 @@ struct verify_forward_conv : conv_base 1, &ret_algo_count, &perf, - ws_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), search); } @@ -765,7 +748,7 @@ struct verify_forward_conv : conv_base << " != " << ws_size << std::endl; } } - resize_workspace(handle, selected.workspace_size, ws, ws_dev); + wspace.resize(selected.workspace_size); filter.CompileSolution(ctx, problem, selected.solution_id); @@ -776,8 +759,8 @@ struct verify_forward_conv : conv_base in_dev.get(), rout.desc, out_dev.get(), - ws_dev.get(), - selected.workspace_size, + wspace.ptr(), + wspace.size(), selected.solution_id); } break; @@ -832,9 +815,7 @@ struct verify_forward_conv : conv_base if(api == ConvApi::Find_1_0) { - const auto workspace_size = filter.GetWorkSpaceSize(ctx, problem); - std::vector workspace(workspace_size); - auto workspace_dev = workspace_size != 0 ? handle.Write(workspace) : nullptr; + wspace.resize(filter.GetWorkSpaceSize(ctx, problem)); int ret_algo_count; miopenConvAlgoPerf_t perf; @@ -851,15 +832,13 @@ struct verify_forward_conv : conv_base 1, &ret_algo_count, &perf, - workspace_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), search); - workspace_dev.reset(); if(perf.memory > 0) { - workspace.resize(perf.memory); - workspace_dev = handle.Write(workspace); + wspace.resize(perf.memory); } filter.ConvolutionForward(handle, @@ -872,8 +851,8 @@ struct verify_forward_conv : conv_base &beta, rout.desc, out_dev.get(), - workspace_dev.get(), - workspace_size); + wspace.ptr(), + wspace.size()); /// \ref read_solver_name auto solutions = filter.GetSolutions(ctx, problem, 1, &fallback_path_taken); @@ -904,9 +883,7 @@ struct verify_forward_conv : conv_base { if(api == ConvApi::Find_1_0) { - const auto workspace_size = filter.GetWorkSpaceSize(ctx, problem); - std::vector workspace(workspace_size); - auto workspace_dev = workspace_size != 0 ? handle.Write(workspace) : nullptr; + wspace.resize(filter.GetWorkSpaceSize(ctx, problem)); int ret_algo_count; miopenConvAlgoPerf_t perf; @@ -925,15 +902,13 @@ struct verify_forward_conv : conv_base 1, &ret_algo_count, &perf, - workspace_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), search); - workspace_dev.reset(); if(perf.memory > 0) { - workspace.resize(perf.memory); - workspace_dev = handle.Write(workspace); + wspace.resize(perf.memory); } filter.ConvolutionBackwardData(handle, @@ -946,8 +921,8 @@ struct verify_forward_conv : conv_base &beta, rout.desc, out_dev.get(), - workspace_dev.get(), - workspace_size); + wspace.ptr(), + wspace.size()); } else { @@ -961,15 +936,13 @@ struct verify_forward_conv : conv_base 1, &ret_algo_count, &perf, - workspace_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), search); - workspace_dev.reset(); if(perf.memory > 0) { - workspace.resize(perf.memory); - workspace_dev = handle.Write(workspace); + wspace.resize(perf.memory); } filter.ConvolutionForward(handle, @@ -982,8 +955,8 @@ struct verify_forward_conv : conv_base &beta, rout.desc, out_dev.get(), - workspace_dev.get(), - workspace_size); + wspace.ptr(), + wspace.size()); } /// \ref read_solver_name @@ -1118,6 +1091,8 @@ struct verify_backward_conv : conv_base auto wei_dev = handle.Write(weights.data); auto in_dev = handle.Write(rinput.data); + Workspace wspace{}; + miopenConvSolution_t selected; bool fallback_path_taken = false; std::size_t count = 0; @@ -1133,9 +1108,7 @@ struct verify_backward_conv : conv_base switch(api) { case ConvApi::Immediate: { - const auto workspace_size = filter.GetWorkSpaceSize(ctx, problem); - std::vector workspace(workspace_size); - auto workspace_dev = workspace_size != 0 ? handle.Write(workspace) : nullptr; + wspace.resize(filter.GetWorkSpaceSize(ctx, problem)); int ret_algo_count; miopenConvAlgoPerf_t perf; @@ -1154,8 +1127,8 @@ struct verify_backward_conv : conv_base 1, &ret_algo_count, &perf, - workspace_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), search); } count = filter.GetSolutionCount(ctx, problem); @@ -1183,18 +1156,14 @@ struct verify_backward_conv : conv_base }); selected = std::move(solutions.front()); - std::size_t ws_size; - - ws_size = filter.GetForwardSolutionWorkspaceSize( + [[maybe_unused]] std::size_t ws_size = filter.GetForwardSolutionWorkspaceSize( handle, weights.desc, out.desc, rinput.desc, selected.solution_id); filter.CompileSolution(ctx, problem, selected.solution_id); - workspace_dev.reset(); if(selected.workspace_size > 0) { - workspace.resize(selected.workspace_size); - workspace_dev = handle.Write(workspace); + wspace.resize(selected.workspace_size); } filter.ConvolutionForwardImmediate(handle, @@ -1204,8 +1173,8 @@ struct verify_backward_conv : conv_base out_dev.get(), rinput.desc, in_dev.get(), - workspace_dev.get(), - ws_size, + wspace.ptr(), + wspace.size(), selected.solution_id); } else @@ -1222,8 +1191,8 @@ struct verify_backward_conv : conv_base 1, &ret_algo_count, &perf, - workspace_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), search); } count = filter.GetSolutionCount(ctx, problem); @@ -1250,18 +1219,14 @@ struct verify_backward_conv : conv_base }); selected = std::move(solutions.front()); - std::size_t ws_size; - - ws_size = filter.GetBackwardSolutionWorkspaceSize( + [[maybe_unused]] std::size_t ws_size = filter.GetBackwardSolutionWorkspaceSize( handle, out.desc, weights.desc, rinput.desc, selected.solution_id); filter.CompileSolution(ctx, problem, selected.solution_id); - workspace_dev.reset(); if(selected.workspace_size > 0) { - workspace.resize(selected.workspace_size); - workspace_dev = handle.Write(workspace); + wspace.resize(selected.workspace_size); } filter.ConvolutionBackwardImmediate(handle, @@ -1271,16 +1236,14 @@ struct verify_backward_conv : conv_base wei_dev.get(), rinput.desc, in_dev.get(), - workspace_dev.get(), - ws_size, + wspace.ptr(), + wspace.size(), selected.solution_id); } break; } case ConvApi::Find_1_0: { - const auto workspace_size = filter.GetWorkSpaceSize(ctx, problem); - std::vector workspace(workspace_size); - auto workspace_dev = workspace_size != 0 ? handle.Write(workspace) : nullptr; + wspace.resize(filter.GetWorkSpaceSize(ctx, problem)); int ret_algo_count; miopenConvAlgoPerf_t perf; @@ -1299,15 +1262,13 @@ struct verify_backward_conv : conv_base 1, &ret_algo_count, &perf, - workspace_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), search); - workspace_dev.reset(); if(perf.memory > 0) { - workspace.resize(perf.memory); - workspace_dev = handle.Write(workspace); + wspace.resize(perf.memory); } filter.ConvolutionForward(handle, @@ -1320,8 +1281,8 @@ struct verify_backward_conv : conv_base &beta, rinput.desc, in_dev.get(), - workspace_dev.get(), - workspace_size); + wspace.ptr(), + wspace.size()); } else { @@ -1335,15 +1296,13 @@ struct verify_backward_conv : conv_base 1, &ret_algo_count, &perf, - workspace_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), search); - workspace_dev.reset(); if(perf.memory > 0) { - workspace.resize(perf.memory); - workspace_dev = handle.Write(workspace); + wspace.resize(perf.memory); } filter.ConvolutionBackwardData(handle, @@ -1356,8 +1315,8 @@ struct verify_backward_conv : conv_base &beta, rinput.desc, in_dev.get(), - workspace_dev.get(), - workspace_size); + wspace.ptr(), + wspace.size()); } /// \ref read_solver_name @@ -1487,6 +1446,7 @@ struct verify_backward_weights_conv : conv_base auto out_dev = handle.Write(out.data); auto wei_dev = handle.Write(rweights.data); auto in_dev = handle.Write(input.data); + Workspace wspace{}; miopenConvSolution_t selected; bool fallback_path_taken = false; @@ -1503,9 +1463,7 @@ struct verify_backward_weights_conv : conv_base switch(api) { case ConvApi::Immediate: { - const auto workspace_size = filter.GetWorkSpaceSize(ctx, problem); - std::vector workspace(workspace_size); - auto workspace_dev = workspace_size != 0 ? handle.Write(workspace) : nullptr; + wspace.resize(filter.GetWorkSpaceSize(ctx, problem)); int ret_algo_count; miopenConvAlgoPerf_t perf; @@ -1523,8 +1481,8 @@ struct verify_backward_weights_conv : conv_base 1, &ret_algo_count, &perf, - workspace_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), search); } @@ -1551,9 +1509,7 @@ struct verify_backward_weights_conv : conv_base }); selected = std::move(solutions.front()); - std::size_t ws_size; - - ws_size = filter.GetWrwSolutionWorkspaceSize( + [[maybe_unused]] std::size_t ws_size = filter.GetWrwSolutionWorkspaceSize( handle, filter.mode == miopenTranspose ? input.desc : out.desc, filter.mode == miopenTranspose ? out.desc : input.desc, @@ -1562,11 +1518,9 @@ struct verify_backward_weights_conv : conv_base filter.CompileSolution(ctx, problem, selected.solution_id); - workspace_dev.reset(); if(selected.workspace_size > 0) { - workspace.resize(selected.workspace_size); - workspace_dev = handle.Write(workspace); + wspace.resize(selected.workspace_size); } filter.ConvolutionWrwImmediate( @@ -1577,16 +1531,14 @@ struct verify_backward_weights_conv : conv_base filter.mode == miopenTranspose ? out_dev.get() : in_dev.get(), rweights.desc, wei_dev.get(), - workspace_dev.get(), - ws_size, + wspace.ptr(), + wspace.size(), selected.solution_id); break; } case ConvApi::Find_1_0: { - const auto workspace_size = filter.GetWorkSpaceSize(ctx, problem); - std::vector workspace(workspace_size); - auto workspace_dev = workspace_size != 0 ? handle.Write(workspace) : nullptr; + wspace.resize(filter.GetWorkSpaceSize(ctx, problem)); int ret_algo_count; miopenConvAlgoPerf_t perf; @@ -1603,15 +1555,13 @@ struct verify_backward_weights_conv : conv_base 1, &ret_algo_count, &perf, - workspace_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), search); - workspace_dev.reset(); if(perf.memory > 0) { - workspace.resize(perf.memory); - workspace_dev = handle.Write(workspace); + wspace.resize(perf.memory); } filter.ConvolutionBackwardWeights( @@ -1625,8 +1575,8 @@ struct verify_backward_weights_conv : conv_base &beta, rweights.desc, wei_dev.get(), - workspace_dev.get(), - workspace_size); + wspace.ptr(), + wspace.size()); /// \ref read_solver_name const auto solutions = filter.GetSolutions(ctx, problem, 1, &fallback_path_taken); @@ -1778,9 +1728,7 @@ struct verify_forward_conv_int8 : conv_base wei_vpad_dev.get()); } - const auto workspace_size = filter.GetWorkSpaceSize(ctx, problem); - std::vector workspace(workspace_size); - auto workspace_dev = workspace_size != 0 ? handle.Write(workspace) : nullptr; + Workspace wspace{filter.GetWorkSpaceSize(ctx, problem)}; int ret_algo_count; miopenConvAlgoPerf_t perf; @@ -1797,8 +1745,8 @@ struct verify_forward_conv_int8 : conv_base 1, &ret_algo_count, &perf, - workspace_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), search); } @@ -1826,9 +1774,7 @@ struct verify_forward_conv_int8 : conv_base }); auto selected = std::move(solutions.front()); - std::size_t ws_size; - - ws_size = + [[maybe_unused]] std::size_t ws_size = filter.GetForwardSolutionWorkspaceSize(handle, (is_transform ? weight_vpad_desc : weights.desc), (is_transform ? input_vpad_desc : input.desc), @@ -1837,11 +1783,9 @@ struct verify_forward_conv_int8 : conv_base filter.CompileSolution(ctx, problem, selected.solution_id); - workspace_dev.reset(); if(selected.workspace_size > 0) { - workspace.resize(selected.workspace_size); - workspace_dev = handle.Write(workspace); + wspace.resize(selected.workspace_size); } filter.ConvolutionForwardImmediate(handle, @@ -1851,8 +1795,8 @@ struct verify_forward_conv_int8 : conv_base (is_transform ? in_vpad_dev.get() : in_dev.get()), rout.desc, out_dev.get(), - workspace_dev.get(), - ws_size, + wspace.ptr(), + wspace.size(), selected.solution_id); if(count != 0) diff --git a/test/ctc.cpp b/test/ctc.cpp index d812cef801..1c759220f2 100644 --- a/test/ctc.cpp +++ b/test/ctc.cpp @@ -29,6 +29,7 @@ #include "tensor_holder.hpp" #include "test.hpp" #include "verify.hpp" +#include "workspace.hpp" #include "rnn_util.hpp" #include "random.hpp" #include @@ -651,8 +652,7 @@ struct verify_ctcloss inputLengths.data(), miopenCTCLossAlgo_t(0)); - auto workSpace = tensor{workSpaceSize / sizeof(T)}; - auto workSpace_dev = handle.Write(workSpace.data); + Workspace wspace{workSpaceSize}; auto losses_gpu = losses; auto grads_gpu = grads; @@ -671,8 +671,8 @@ struct verify_ctcloss grads.desc, grads_dev.get(), miopenCTCLossAlgo_t(0), - workSpace_dev.get(), - workSpaceSize); + wspace.ptr(), + wspace.size()); losses_gpu.data = handle.Read(losses_dev, losses_gpu.data.size()); grads_gpu.data = handle.Read(grads_dev, grads_gpu.data.size()); diff --git a/test/find_2_conv.cpp b/test/find_2_conv.cpp index a89ce942a7..6e636e265f 100644 --- a/test/find_2_conv.cpp +++ b/test/find_2_conv.cpp @@ -27,6 +27,7 @@ #include "test.hpp" #include "driver.hpp" #include "get_handle.hpp" +#include "workspace.hpp" #include @@ -210,13 +211,11 @@ struct Find2Test : test_driver } const auto workspace_size = std::min(workspace_limit, workspace_max); - workspace_dev = workspace_size != 0 - ? miopen::deref(handle).Write(std::vector(workspace_size)) - : nullptr; + Workspace wspace{workspace_size}; - EXPECT_EQUAL(miopenSetFindOptionPreallocatedWorkspace( - options, workspace_dev.get(), workspace_size), - miopenStatusSuccess); + EXPECT_EQUAL( + miopenSetFindOptionPreallocatedWorkspace(options, wspace.ptr(), wspace.size()), + miopenStatusSuccess); EXPECT_EQUAL(miopenSetFindOptionPreallocatedTensor( options, miopenTensorConvolutionX, x_dev.get()), @@ -312,14 +311,11 @@ struct Find2Test : test_driver { std::cerr << "Running a solution..." << std::endl; - auto& handle_deref = get_handle(); - std::size_t workspace_size; EXPECT_EQUAL(miopenGetSolutionWorkspaceSize(solution, &workspace_size), miopenStatusSuccess); - auto workspace_dev = - workspace_size != 0 ? handle_deref.Write(std::vector(workspace_size)) : nullptr; + Workspace wspace{workspace_size}; const auto checked_run_solution = [&](miopenTensorDescriptor_t* descriptors_) { auto arguments = std::make_unique(num_arguments); @@ -331,10 +327,9 @@ struct Find2Test : test_driver arguments[i].buffer = buffers[i]; } - EXPECT_EQUAL( - miopenRunSolution( - handle, solution, 3, arguments.get(), workspace_dev.get(), workspace_size), - miopenStatusSuccess); + EXPECT_EQUAL(miopenRunSolution( + handle, solution, 3, arguments.get(), wspace.ptr(), wspace.size()), + miopenStatusSuccess); }; // Without descriptors diff --git a/test/find_db.cpp b/test/find_db.cpp index b07167f80e..47c485182e 100644 --- a/test/find_db.cpp +++ b/test/find_db.cpp @@ -27,6 +27,7 @@ #include "test.hpp" #include "driver.hpp" #include "get_handle.hpp" +#include "workspace.hpp" #include #include @@ -111,10 +112,7 @@ struct FindDbTest : test_driver const auto ctx = ExecutionContext{&handle}; const auto problem = conv::ProblemDescription{y.desc, w.desc, x.desc, filter, conv::Direction::BackwardData}; - const auto workspace_size = filter.GetWorkSpaceSize(ctx, problem); - - auto workspace = std::vector(workspace_size); - auto workspace_dev = workspace_size != 0 ? handle.Write(workspace) : nullptr; + Workspace wspace{filter.GetWorkSpaceSize(ctx, problem)}; auto filterCall = [&]() { int ret_algo_count; @@ -130,8 +128,8 @@ struct FindDbTest : test_driver 1, &ret_algo_count, perf, - workspace_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), false); }; @@ -145,10 +143,7 @@ struct FindDbTest : test_driver const auto ctx = ExecutionContext{&handle}; const auto problem = conv::ProblemDescription{x.desc, w.desc, y.desc, filter, conv::Direction::Forward}; - const auto workspace_size = filter.GetWorkSpaceSize(ctx, problem); - - auto workspace = std::vector(workspace_size); - auto workspace_dev = workspace_size != 0 ? handle.Write(workspace) : nullptr; + Workspace wspace{filter.GetWorkSpaceSize(ctx, problem)}; auto filterCall = [&]() { int ret_algo_count; @@ -164,8 +159,8 @@ struct FindDbTest : test_driver 1, &ret_algo_count, perf, - workspace_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), false); }; @@ -179,10 +174,7 @@ struct FindDbTest : test_driver const auto ctx = ExecutionContext{&handle}; const auto problem = conv::ProblemDescription{ y.desc, w.desc, x.desc, filter, conv::Direction::BackwardWeights}; - const auto workspace_size = filter.GetWorkSpaceSize(ctx, problem); - - auto workspace = std::vector(workspace_size); - auto workspace_dev = workspace_size != 0 ? handle.Write(workspace) : nullptr; + Workspace wspace{filter.GetWorkSpaceSize(ctx, problem)}; auto filterCall = [&]() { int ret_algo_count; @@ -198,8 +190,8 @@ struct FindDbTest : test_driver 1, &ret_algo_count, perf, - workspace_dev.get(), - workspace_size, + wspace.ptr(), + wspace.size(), false); }; diff --git a/test/gru_common.hpp b/test/gru_common.hpp index 51e3c09012..4df6209e9d 100644 --- a/test/gru_common.hpp +++ b/test/gru_common.hpp @@ -35,6 +35,7 @@ #include "verify.hpp" #include "rnn_util.hpp" #include "random.hpp" +#include "workspace.hpp" #include #include #include @@ -1963,8 +1964,8 @@ struct verify_forward_infer_gru #endif auto&& handle = get_handle(); - size_t out_sz = 0; - size_t workSpaceSize = 0; + size_t out_sz = 0; + size_t workspace_size = 0; std::vector inputCPPDescs; std::vector inputDescs; @@ -1979,9 +1980,8 @@ struct verify_forward_infer_gru hiddenSize * ((dirMode != 0) ? 2 : 1), miopen::deref(rnnDesc).dataType); - miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workSpaceSize); - - std::vector workSpace(workSpaceSize / sizeof(T)); + miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workspace_size); + Workspace wspace{workspace_size}; auto input_dev = handle.Write(input); @@ -1994,8 +1994,6 @@ struct verify_forward_infer_gru std::fill(hy.begin(), hy.end(), 0.); auto hy_dev = handle.Write(hy); - auto workSpace_dev = handle.Write(workSpace); - std::vector hlens(3, 0); hlens[0] = nLayers * (dirMode != 0 ? 2 : 1); hlens[1] = batch_seq[0]; @@ -2027,8 +2025,8 @@ struct verify_forward_infer_gru ((nohy) ? nullptr : hy_dev.get()), &hiddenDesc, nullptr, - workSpace_dev.get(), - workSpaceSize); + wspace.ptr(), + wspace.size()); #if(MIO_GRU_TEST_DEBUG == 2) auto outdata = handle.Read(output_dev, output.size()); @@ -2249,7 +2247,7 @@ struct verify_forward_train_gru auto&& handle = get_handle(); size_t out_sz = 0; - size_t workSpaceSize = 0; + size_t workspace_size = 0; size_t reserveSpaceSize = 0; std::vector inputCPPDescs; @@ -2265,12 +2263,14 @@ struct verify_forward_train_gru hiddenSize * ((dirMode != 0) ? 2 : 1), miopen::deref(rnnDesc).dataType); - miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workSpaceSize); + miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workspace_size); + Workspace wspace{workspace_size}; + miopenGetRNNTrainingReserveSize( &handle, rnnDesc, seqLength, inputDescs.data(), &reserveSpaceSize); - - std::vector workSpace(workSpaceSize / sizeof(T)); - std::vector reserveSpace((reserveSpaceSize + sizeof(T) - 1) / sizeof(T)); + reserveSpaceSize = (reserveSpaceSize + sizeof(T) - 1) & ~(sizeof(T) - 1); + assert(reserveSpaceSize % sizeof(T) == 0); + Workspace rspace{reserveSpaceSize}; auto input_dev = handle.Write(input); @@ -2284,9 +2284,6 @@ struct verify_forward_train_gru std::fill(hy.begin(), hy.end(), 0.); auto hy_dev = handle.Write(hy); - auto workSpace_dev = handle.Write(workSpace); - auto reserveSpace_dev = handle.Write(reserveSpace); - std::vector hlens(3, 0); hlens[0] = nLayers * (dirMode != 0 ? 2 : 1); hlens[1] = batch_seq[0]; @@ -2318,10 +2315,10 @@ struct verify_forward_train_gru ((nohy) ? nullptr : hy_dev.get()), &hiddenDesc, nullptr, - workSpace_dev.get(), - workSpaceSize, - reserveSpace_dev.get(), - reserveSpaceSize); + wspace.ptr(), + wspace.size(), + rspace.ptr(), + rspace.size()); #if(MIO_GRU_TEST_DEBUG == 2) auto outdata = handle.Read(output_dev, output.size()); @@ -2331,10 +2328,9 @@ struct verify_forward_train_gru } #endif - auto retSet = std::make_tuple( - handle.Read(output_dev, output.size()), - (nohy ? initHidden : handle.Read(hy_dev, hy.size())), - handle.Read(reserveSpace_dev, (reserveSpaceSize + sizeof(T) - 1) / sizeof(T))); + auto retSet = std::make_tuple(handle.Read(output_dev, output.size()), + (nohy ? initHidden : handle.Read(hy_dev, hy.size())), + rspace.Read>()); #if(MIO_RNN_TIME_EVERYTHING == 1) auto t_end = std::chrono::high_resolution_clock::now(); @@ -2484,7 +2480,7 @@ struct verify_backward_data_gru int bi = dirMode != 0 ? 2 : 1; int hy_h = hiddenSize; int bi_stride = bi * hy_h; - size_t workSpaceSize; + size_t workspace_size; std::vector inputCPPDescs; std::vector inputDescs; @@ -2494,8 +2490,8 @@ struct verify_backward_data_gru // Outputs ---------- size_t in_sz = 0; miopenGetRNNInputTensorSize(&handle, rnnDesc, seqLength, inputDescs.data(), &in_sz); - miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workSpaceSize); - std::vector workSpace(workSpaceSize / sizeof(T)); + miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workspace_size); + std::vector workSpace(workspace_size / sizeof(T)); std::vector dx(in_sz / sizeof(T)); std::vector dhx(initHidden.size()); @@ -2562,8 +2558,7 @@ struct verify_backward_data_gru auto&& handle = get_handle(); - size_t out_sz = 0; - size_t workSpaceSize = 0; + size_t out_sz = 0; std::vector inputCPPDescs; std::vector inputDescs; @@ -2578,15 +2573,17 @@ struct verify_backward_data_gru hiddenSize * ((dirMode != 0) ? 2 : 1), miopen::deref(rnnDesc).dataType); - miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workSpaceSize); - std::vector workSpace(workSpaceSize / sizeof(T)); - auto workSpace_dev = handle.Write(workSpace); + size_t workspace_size = 0; + miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workspace_size); + Workspace wspace{workspace_size}; miopenGetRNNInputTensorSize(&handle, rnnDesc, seqLength, outputDescs.data(), &out_sz); - auto yin_dev = handle.Write(yin); - auto dyin_dev = handle.Write(dy); - auto reserveSpace_dev = handle.Write(reserveSpace); - auto weights_dev = handle.Write(weights); + auto yin_dev = handle.Write(yin); + auto dyin_dev = handle.Write(dy); + auto weights_dev = handle.Write(weights); + + Workspace rspace{}; + rspace.Write(reserveSpace); std::vector hlens(3, 0); hlens[0] = nLayers * (dirMode != 0 ? 2 : 1); @@ -2633,15 +2630,15 @@ struct verify_backward_data_gru ((nodhx) ? nullptr : dhx_dev.get()), &hiddenDesc, nullptr, - workSpace_dev.get(), - workSpaceSize, - reserveSpace_dev.get(), - reserveSpace.size() * sizeof(T)); + wspace.ptr(), + wspace.size(), + rspace.ptr(), + rspace.size()); auto retSet = std::make_tuple(handle.Read(dx_dev, dx.size()), (nodhx ? initHidden : handle.Read(dhx_dev, dhx.size())), - handle.Read(reserveSpace_dev, reserveSpace.size()), - handle.Read(workSpace_dev, workSpace.size())); + rspace.Read>(), + wspace.Read>()); #if(MIO_RNN_TIME_EVERYTHING == 1) auto t_end = std::chrono::high_resolution_clock::now(); @@ -2840,8 +2837,11 @@ struct verify_backward_weights_gru hiddenSize * ((dirMode != 0) ? 2 : 1), miopen::deref(rnnDesc).dataType); - auto workSpace_dev = handle.Write(workSpace); - auto reserveSpace_dev = handle.Write(reserveSpace); + Workspace wspace{}; + wspace.Write(workSpace); + Workspace rspace{}; + rspace.Write(reserveSpace); + std::vector dweights(weightSize); auto dweights_dev = handle.Write(dweights); miopen::TensorDescriptor weightDesc(miopen::deref(rnnDesc).dataType, {weightSize}); @@ -2869,10 +2869,10 @@ struct verify_backward_weights_gru dy_dev.get(), &weightDesc, dweights_dev.get(), - workSpace_dev.get(), - workSpace.size() * sizeof(T), - reserveSpace_dev.get(), - reserveSpace.size() * sizeof(T)); + wspace.ptr(), + wspace.size(), + rspace.ptr(), + rspace.size()); #if(MIO_RNN_TIME_EVERYTHING == 1) auto t_end = std::chrono::high_resolution_clock::now(); @@ -3123,10 +3123,10 @@ struct gru_basic_driver : test_driver size_t reserveSpaceSize; miopenGetRNNTrainingReserveSize( &handle, rnnDesc, seqLength, inputDescs.data(), &reserveSpaceSize); - size_t workSpaceSize; - miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workSpaceSize); + size_t workspace_size; + miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workspace_size); - size_t total_mem = statesSizeInBytes + reserveSpaceSize + workSpaceSize + 2 * out_sz + + size_t total_mem = statesSizeInBytes + reserveSpaceSize + workspace_size + 2 * out_sz + (in_sz + wei_sz + (nohx ? 0 : hx_sz) + (nohy ? 0 : hx_sz) + (nodhx ? 0 : hx_sz) + (nodhy ? 0 : hx_sz)) * sizeof(T); diff --git a/test/gtest/conv_api_strided_tensors.cpp b/test/gtest/conv_api_strided_tensors.cpp index 9a2876b3f0..d4e49201f4 100644 --- a/test/gtest/conv_api_strided_tensors.cpp +++ b/test/gtest/conv_api_strided_tensors.cpp @@ -29,6 +29,7 @@ #include #include #include "platform.hpp" +#include "../workspace.hpp" #define MIOPEN_CHECK_RET(val) ASSERT_EQ(val, miopenStatusSuccess) @@ -66,10 +67,12 @@ class ConvStridedTensors : public ::testing::Test MIOPEN_CHECK_RET(miopenSetConvolutionGroupCount(conv_descr, 1)); // Workspace + size_t sz = 0; MIOPEN_CHECK_RET(miopenConvolutionForwardGetWorkSpaceSize( - handle, filter_descr, input_descr, conv_descr, output_descr, &workspace_size)); + handle, filter_descr, input_descr, conv_descr, output_descr, &sz)); // Data + wspace.resize(sz); h_input.resize(input_size); h_filter.resize(filter_size); h_output.resize(output_size); @@ -125,7 +128,7 @@ class ConvStridedTensors : public ::testing::Test std::vector dilation = {1, 1, 1}; // Workspace - size_t workspace_size; + Workspace wspace{}; // Data const size_t input_size = input_dims[0] * input_strides[0]; @@ -143,10 +146,9 @@ TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) { auto device = Device(handle); - auto d_workspace = device.Malloc(workspace_size); - auto d_input = device.Malloc(input_bytes); - auto d_filter = device.Malloc(filter_bytes); - auto d_output = device.Malloc(output_bytes); + auto d_input = device.Malloc(input_bytes); + auto d_filter = device.Malloc(filter_bytes); + auto d_output = device.Malloc(output_bytes); std::fill_n(h_input.begin(), h_input.size(), 1.f); ASSERT_TRUE(d_input.CopyToDevice(h_input.data(), input_bytes)); @@ -168,8 +170,8 @@ TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) sizeof(perf_results) / sizeof(perf_results[0]), &perf_results_count, perf_results, - d_workspace.Data(), - workspace_size, + wspace.ptr(), + wspace.size(), true), miopenStatusSuccess); ASSERT_GT(perf_results_count, 0); @@ -189,8 +191,8 @@ TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) &beta, output_descr, d_output.Data(), - d_workspace.Data(), - workspace_size), + wspace.ptr(), + wspace.size()), miopenStatusSuccess); ASSERT_TRUE(device.Synchronize()); } diff --git a/test/gtest/solver_bwd.hpp b/test/gtest/solver_bwd.hpp index cb55a5951b..6511800ce6 100644 --- a/test/gtest/solver_bwd.hpp +++ b/test/gtest/solver_bwd.hpp @@ -40,6 +40,7 @@ #include "conv_tensor_gen.hpp" #include "get_solver.hpp" +#include "../workspace.hpp" template struct ConvBwdSolverTest @@ -76,15 +77,11 @@ struct ConvBwdSolverTest if(solv.MayNeedWorkspace()) { const auto cur_sol_ws = solv.GetWorkspaceSize(ctx, problem); - workspace_dev = handle.Create(cur_sol_ws); - workspace_size = cur_sol_ws; + wspace.resize(cur_sol_ws); } - const auto invoke_params = - miopen::conv::DataInvokeParams{tensors, - workspace_dev.get(), - workspace_size, - conv_desc.attribute.gfx90aFp16alt.GetBwd()}; + const auto invoke_params = miopen::conv::DataInvokeParams{ + tensors, wspace.ptr(), wspace.size(), conv_desc.attribute.gfx90aFp16alt.GetBwd()}; auto sol = GetSolution(solv, ctx, problem); ASSERT_TRUE(sol.Succeeded()); @@ -177,8 +174,7 @@ struct ConvBwdSolverTest miopen::Allocator::ManageDataPtr in_dev; miopen::Allocator::ManageDataPtr wei_dev; miopen::Allocator::ManageDataPtr out_dev; - miopen::Allocator::ManageDataPtr workspace_dev; - size_t workspace_size; + Workspace wspace{}; miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoDirect; bool test_skipped = false; }; diff --git a/test/gtest/solver_fwd.hpp b/test/gtest/solver_fwd.hpp index 88fa9a9c55..20b16fcc32 100644 --- a/test/gtest/solver_fwd.hpp +++ b/test/gtest/solver_fwd.hpp @@ -38,6 +38,7 @@ #include "conv_test_base.hpp" #include "get_solver.hpp" +#include "../workspace.hpp" template struct ConvFwdSolverTest @@ -77,15 +78,11 @@ struct ConvFwdSolverTest if(solv.MayNeedWorkspace()) { const auto cur_sol_ws = solv.GetWorkspaceSize(ctx, problem); - workspace_dev = handle.Create(cur_sol_ws); - workspace_size = cur_sol_ws; + wspace.resize(cur_sol_ws); } - const auto invoke_params = - miopen::conv::DataInvokeParams{tensors, - workspace_dev.get(), - workspace_size, - this->conv_desc.attribute.gfx90aFp16alt.GetFwd()}; + const auto invoke_params = miopen::conv::DataInvokeParams{ + tensors, wspace.ptr(), wspace.size(), this->conv_desc.attribute.gfx90aFp16alt.GetFwd()}; // auto sol = solv.GetSolution(ctx, problem); // This is complicated due to the split between tunable and non-tunable solvers @@ -116,8 +113,7 @@ struct ConvFwdSolverTest } ConvTestCase conv_config; - miopen::Allocator::ManageDataPtr workspace_dev; - size_t workspace_size; + Workspace wspace{}; miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoDirect; bool test_skipped = false; miopenTensorLayout_t tensor_layout; diff --git a/test/gtest/solver_wrw.hpp b/test/gtest/solver_wrw.hpp index dde92e2071..dcf8311d83 100644 --- a/test/gtest/solver_wrw.hpp +++ b/test/gtest/solver_wrw.hpp @@ -40,6 +40,7 @@ #include "conv_tensor_gen.hpp" #include "get_solver.hpp" +#include "../workspace.hpp" template struct ConvWrwSolverTest @@ -76,15 +77,11 @@ struct ConvWrwSolverTest if(solv.MayNeedWorkspace()) { const auto cur_sol_ws = solv.GetWorkspaceSize(ctx, problem); - workspace_dev = handle.Create(cur_sol_ws); - workspace_size = cur_sol_ws; + wspace.resize(cur_sol_ws); } - const auto invoke_params = - miopen::conv::WrWInvokeParams{tensors, - workspace_dev.get(), - workspace_size, - conv_desc.attribute.gfx90aFp16alt.GetBwd()}; + const auto invoke_params = miopen::conv::WrWInvokeParams{ + tensors, wspace.ptr(), wspace.size(), conv_desc.attribute.gfx90aFp16alt.GetBwd()}; auto sol = GetSolution(solv, ctx, problem); ASSERT_TRUE(sol.Succeeded()); @@ -179,8 +176,7 @@ struct ConvWrwSolverTest miopen::Allocator::ManageDataPtr in_dev; miopen::Allocator::ManageDataPtr wei_dev; miopen::Allocator::ManageDataPtr out_dev; - miopen::Allocator::ManageDataPtr workspace_dev; - size_t workspace_size; + Workspace wspace{}; miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoDirect; bool test_skipped = false; }; diff --git a/test/lstm_common.hpp b/test/lstm_common.hpp index a09aab4209..d2b7d1a077 100644 --- a/test/lstm_common.hpp +++ b/test/lstm_common.hpp @@ -27,6 +27,7 @@ #ifndef GUARD_MIOPEN_TEST_LSTM_COMMON_HPP #define GUARD_MIOPEN_TEST_LSTM_COMMON_HPP +#include "workspace.hpp" #include "driver.hpp" #include "dropout_util.hpp" #include "get_handle.hpp" @@ -96,6 +97,8 @@ struct verify_backward_data_lstm std::vector initHidden; // HX std::vector initCell; // CX std::vector weights; + std::vector& RSVgpu; + std::vector& RSVcpu; std::vector batch_seq; int hiddenSize; int seqLength; @@ -115,8 +118,6 @@ struct verify_backward_data_lstm bool nodcx; bool use_dropout; bool use_seqPadding; - typename std::vector::iterator RSVgpu; - typename std::vector::iterator RSVcpu; verify_backward_data_lstm(miopenRNNDescriptor_t pRD, const std::vector& py, @@ -153,6 +154,8 @@ struct verify_backward_data_lstm initHidden(phx), initCell(pcx), weights(pW), + RSVgpu(pRSVgpu), + RSVcpu(pRSVcpu), batch_seq(pBS), hiddenSize(pHS), seqLength(pS), @@ -171,9 +174,7 @@ struct verify_backward_data_lstm nodhx(pnodhx), nodcx(pnodcx), use_dropout(puse_dropout), - use_seqPadding(puse_seqPadding), - RSVgpu(pRSVgpu.begin()), - RSVcpu(pRSVcpu.begin()) + use_seqPadding(puse_seqPadding) { if(!nohx) initHidden = phx; // this may be intentionally a nullptr @@ -248,6 +249,8 @@ struct verify_backward_weights_lstm std::vector input; // Y std::vector dy; // dY std::vector initHidden; // HX + std::vector reserveSpace_gpu; + std::vector reserveSpace_cpu; std::vector workSpace; std::vector batch_seq; int weightSize; @@ -264,8 +267,6 @@ struct verify_backward_weights_lstm bool nohx; bool use_dropout; bool use_seqPadding; - typename std::vector reserveSpace_gpu; - typename std::vector reserveSpace_cpu; verify_backward_weights_lstm(miopenRNNDescriptor_t pRD, const std::vector& px, @@ -291,6 +292,8 @@ struct verify_backward_weights_lstm : input(px), dy(pdy), initHidden(phx), + reserveSpace_gpu(pRSVgpu), + reserveSpace_cpu(pRSVcpu), workSpace(pWS), batch_seq(pBS), weightSize(pW), @@ -306,9 +309,7 @@ struct verify_backward_weights_lstm realHiddenSize(pHXZ), nohx(pnohx), use_dropout(puse_dropout), - use_seqPadding(puse_seqPadding), - reserveSpace_gpu(pRSVgpu), - reserveSpace_cpu(pRSVcpu) + use_seqPadding(puse_seqPadding) { if(!nohx) initHidden = phx; // this may be intentionally a nullptr @@ -533,8 +534,7 @@ struct verify_forward_infer_lstm : verify_forward_lstm auto&& handle = get_handle(); - size_t out_sz = 0; - size_t workSpaceSize = 0; + size_t out_sz = 0; std::vector inputCPPDescs; std::vector inputDescs; @@ -549,9 +549,10 @@ struct verify_forward_infer_lstm : verify_forward_lstm hiddenSize * ((dirMode != 0) ? 2 : 1), miopen::deref(rnnDesc).dataType); - miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workSpaceSize); + size_t workspace_size = 0; + miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workspace_size); - std::vector workSpace(workSpaceSize / sizeof(T)); + Workspace wspace{workspace_size}; auto input_dev = handle.Write(input); @@ -565,8 +566,6 @@ struct verify_forward_infer_lstm : verify_forward_lstm auto cy = initCell; std::fill(cy.begin(), cy.end(), 0.); - auto workSpace_dev = handle.Write(workSpace); - std::vector hlens(3, 0); hlens[0] = nLayers * (dirMode != 0 ? 2 : 1); hlens[1] = batch_seq[0]; @@ -577,6 +576,9 @@ struct verify_forward_infer_lstm : verify_forward_lstm wlen[0] = weights.size(); miopen::TensorDescriptor weightDesc(miopen::deref(rnnDesc).dataType, wlen); + /// \todo: fix the handle.Write() calls below because they generate + /// temporary objects that may get destroyed before the + /// miopenRNNForwardInference call happens miopenRNNForwardInference(&handle, rnnDesc, seqLength, @@ -594,8 +596,8 @@ struct verify_forward_infer_lstm : verify_forward_lstm ((nohy) ? nullptr : handle.Write(hy).get()), &hiddenDesc, ((nocy) ? nullptr : handle.Write(cy).get()), - workSpace_dev.get(), - workSpaceSize); + wspace.ptr(), + wspace.size()); #if(MIO_LSTM_TEST_DEBUG == 2) auto outdata = handle.Read(output_dev, output.size()); @@ -673,9 +675,10 @@ struct verify_forward_train_lstm : verify_forward_lstm using verify_forward_lstm::nocy; using verify_forward_lstm::use_seqPadding; + std::vector& RSVgpu; + std::vector& RSVcpu; + bool use_dropout; - typename std::vector::iterator RSVgpu; - typename std::vector::iterator RSVcpu; verify_forward_train_lstm(miopenRNNDescriptor_t pRD, const std::vector& px, @@ -700,7 +703,7 @@ struct verify_forward_train_lstm : verify_forward_lstm const bool pnocy = false, const bool puse_dropout = false, const bool puse_seqPadding = false) - : RSVgpu(pRSVgpu.begin()), RSVcpu(pRSVcpu.begin()) + : RSVgpu(pRSVgpu), RSVcpu(pRSVcpu) { input = px; initHidden = phx; @@ -861,7 +864,11 @@ struct verify_forward_train_lstm : verify_forward_lstm ChangeDataPadding(*packed_output, output, batch_seq, batch_seq[0], out_h, true); } - std::copy(reserveSpace.begin(), reserveSpace.end(), RSVcpu); + if(reserveSpace.size() != RSVcpu.size()) + { + std::abort(); + } + std::copy(reserveSpace.begin(), reserveSpace.end(), RSVcpu.begin()); auto retSet = std::make_tuple( output, (nohy ? initHidden : hiddenState), (nocy ? initCell : cellState)); @@ -908,17 +915,15 @@ struct verify_forward_train_lstm : verify_forward_lstm std::fill(output.begin(), output.end(), static_cast(0)); auto output_dev = handle.Write(output); - size_t workSpaceSize = 0; + size_t workspace_size = 0; + miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workspace_size); + Workspace wspace{workspace_size}; + size_t reserveSpaceSize = 0; - miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workSpaceSize); miopenGetRNNTrainingReserveSize( &handle, rnnDesc, seqLength, inputDescs.data(), &reserveSpaceSize); - - std::vector workSpace(workSpaceSize / sizeof(T)); - std::vector reserveSpace((reserveSpaceSize + sizeof(T) - 1) / sizeof(T)); - - auto workSpace_dev = handle.Write(workSpace); - auto reserveSpace_dev = handle.Write(reserveSpace); + reserveSpaceSize = (reserveSpaceSize + (sizeof(T) - 1)) & ~(sizeof(T) - 1); + Workspace rspace{reserveSpaceSize}; auto weights_dev = handle.Write(weights); @@ -957,10 +962,10 @@ struct verify_forward_train_lstm : verify_forward_lstm ((nohy) ? nullptr : hy_dev.get()), &hiddenDesc, ((nocy) ? nullptr : cy_dev.get()), - workSpace_dev.get(), - workSpaceSize, - reserveSpace_dev.get(), - reserveSpaceSize); + wspace.ptr(), + wspace.size(), + rspace.ptr(), + rspace.size()); #if(MIO_LSTM_TEST_DEBUG == 2) auto outdata = handle.Read(output_dev, output.size()); @@ -969,9 +974,7 @@ struct verify_forward_train_lstm : verify_forward_lstm printf("GPU outdata[%d]: %f\n", i, outdata[i]); } #endif - reserveSpace = - handle.Read(reserveSpace_dev, (reserveSpaceSize + sizeof(T) - 1) / sizeof(T)); - std::copy(reserveSpace.begin(), reserveSpace.end(), RSVgpu); + rspace.ReadTo(RSVgpu); std::vector output_gpu = handle.Read(output_dev, output.size()); @@ -1049,7 +1052,7 @@ verify_backward_data_lstm::cpu() const int hy_h = hiddenSize; int bi_stride = bi * hy_h; int out_h = hiddenSize * ((dirMode != 0) ? 2 : 1); - size_t workSpaceSize; + size_t workspace_size; std::vector inputCPPDescs; std::vector inputDescs; @@ -1066,8 +1069,8 @@ verify_backward_data_lstm::cpu() const true, use_seqPadding); - miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workSpaceSize); - std::vector workSpace(workSpaceSize / sizeof(T)); + miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workspace_size); + std::vector workSpace(workspace_size / sizeof(T)); std::vector dx(in_sz); std::vector dhx(initHidden.size()); std::vector dcx(initHidden.size()); @@ -1086,8 +1089,11 @@ verify_backward_data_lstm::cpu() const reserveSpaceSize = (reserveSpaceSize + sizeof(T) - 1) / sizeof(T); } - std::vector reserveSpace(reserveSpaceSize); - std::copy(RSVcpu, RSVcpu + reserveSpaceSize, reserveSpace.begin()); + if(reserveSpaceSize != RSVcpu.size()) + { + std::abort(); + } + std::vector reserveSpace(RSVcpu); std::vector converted_dinput; std::vector converted_output; @@ -1119,7 +1125,7 @@ verify_backward_data_lstm::cpu() const packed_doutput = &converted_doutput; // WA - wa_workSpace.resize(workSpaceSize / sizeof(T) - (packedXInSize + packedYOutSize)); + wa_workSpace.resize(workspace_size / sizeof(T) - (packedXInSize + packedYOutSize)); wa_shifted_workSpace = &wa_workSpace; } else @@ -1186,7 +1192,7 @@ verify_backward_data_lstm::cpu() const workSpace.begin() + converted_doutput.size() + converted_dinput.size()); } - std::copy(reserveSpace.begin(), reserveSpace.end(), RSVcpu); + std::copy(reserveSpace.begin(), reserveSpace.end(), RSVcpu.begin()); // TODO: remove workSpace auto retSet = @@ -1210,8 +1216,6 @@ verify_backward_data_lstm::gpu() const auto&& handle = get_handle(); - size_t workSpaceSize = 0; - std::vector inputCPPDescs; std::vector inputDescs; createTensorDescArray( @@ -1225,20 +1229,37 @@ verify_backward_data_lstm::gpu() const hiddenSize * ((dirMode != 0) ? 2 : 1), miopen::deref(rnnDesc).dataType); - miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workSpaceSize); - std::vector workSpace(workSpaceSize / sizeof(T)); - auto workSpace_dev = handle.Write(workSpace); - - size_t reserveSpaceSize; + size_t workspace_size = 0; + miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workspace_size); + if(workspace_size % sizeof(T) != 0) + { + std::abort(); + } + Workspace wspace{}; + // Needed to zero out the workspace (happens in std::vector's constructor) + // or else this test fails verification when workspace is compared against the + // workspace returned by ::cpu method in this class + wspace.Write(std::vector(workspace_size / sizeof(T))); + // wspace.resize(workspace_size); + + size_t reserveSpaceSize = 0; miopenGetRNNTrainingReserveSize( &handle, rnnDesc, seqLength, inputDescs.data(), &reserveSpaceSize); - std::vector reserveSpace((reserveSpaceSize + sizeof(T) - 1) / sizeof(T)); - std::copy(RSVgpu, RSVgpu + reserveSpace.size(), reserveSpace.begin()); + /// \todo: fix miopenGetRNNTrainingReserveSize to return a multiple of + /// sizeof(T) + // Needed because reserveSpaceSize returned is not a multiple of sizeof(T). + reserveSpaceSize = (reserveSpaceSize + (sizeof(T) - 1)) & ~(sizeof(T) - 1); - auto yin_dev = handle.Write(yin); - auto dyin_dev = handle.Write(dy); - auto reserveSpace_dev = handle.Write(reserveSpace); - auto weights_dev = handle.Write(weights); + if(reserveSpaceSize != (RSVgpu.size() * sizeof(T))) + { + std::abort(); + } + Workspace rspace{}; + rspace.Write(RSVgpu); + + auto yin_dev = handle.Write(yin); + auto dyin_dev = handle.Write(dy); + auto weights_dev = handle.Write(weights); std::vector hlens(3, 0); hlens[0] = nLayers * (dirMode != 0 ? 2 : 1); @@ -1284,18 +1305,18 @@ verify_backward_data_lstm::gpu() const ((nodhx) ? nullptr : dhx_dev.get()), &hiddenDesc, ((nodcx) ? nullptr : dcx_dev.get()), - workSpace_dev.get(), - workSpaceSize, - reserveSpace_dev.get(), - reserveSpace.size() * sizeof(T)); + wspace.ptr(), + wspace.size(), + rspace.ptr(), + rspace.size()); - reserveSpace = handle.Read(reserveSpace_dev, reserveSpace.size()); - std::copy(reserveSpace.begin(), reserveSpace.end(), RSVgpu); + assert(RSVgpu.size() * sizeof(T) == rspace.size()); + rspace.ReadTo(RSVgpu); // TODO: remove workSpace auto retSet = std::make_tuple(handle.Read(dx_dev, dx.size()), (nodhx ? initHidden : handle.Read(dhx_dev, dhx.size())), (nodcx ? initCell : handle.Read(dcx_dev, dcx.size())), - handle.Read(workSpace_dev, workSpace.size())); + wspace.Read>()); #if(MIO_RNN_TIME_EVERYTHING == 1) auto t_end = std::chrono::high_resolution_clock::now(); @@ -1427,8 +1448,12 @@ std::vector verify_backward_weights_lstm::gpu() const hiddenSize * ((dirMode != 0) ? 2 : 1), miopen::deref(rnnDesc).dataType); - auto workSpace_dev = handle.Write(workSpace); - auto reserveSpace_dev = handle.Write(reserveSpace_gpu); + Workspace wspace{}; + wspace.Write(workSpace); + + Workspace rspace{}; + rspace.Write(reserveSpace_gpu); + std::vector dweights(weightSize); auto dweights_dev = handle.Write(dweights); miopen::TensorDescriptor weightDesc(miopen::deref(rnnDesc).dataType, {weightSize}); @@ -1452,10 +1477,10 @@ std::vector verify_backward_weights_lstm::gpu() const dy_dev.get(), &weightDesc, dweights_dev.get(), - workSpace_dev.get(), - workSpace.size() * sizeof(T), - reserveSpace_dev.get(), - reserveSpace_gpu.size() * sizeof(T)); + wspace.ptr(), + wspace.size(), + rspace.ptr(), + rspace.size()); #if(MIO_RNN_TIME_EVERYTHING == 1) auto t_end = std::chrono::high_resolution_clock::now(); @@ -1732,10 +1757,10 @@ struct lstm_basic_driver : test_driver false, usePadding); - size_t workSpaceSize; - miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workSpaceSize); + size_t workspace_size; + miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workspace_size); - size_t total_mem = statesSizeInBytes + reserveSpaceSize + workSpaceSize + + size_t total_mem = statesSizeInBytes + reserveSpaceSize + workspace_size + (2 * out_sz + in_sz + wei_sz + (nohx ? 0 : hx_sz) + (nohy ? 0 : hx_sz) + (nodhx ? 0 : hx_sz) + (nodhy ? 0 : hx_sz) + (nocx ? 0 : hx_sz) + (nocy ? 0 : hx_sz) + (nodcx ? 0 : hx_sz) + (nodcy ? 0 : hx_sz)) * @@ -1815,7 +1840,7 @@ struct lstm_basic_driver : test_driver #if(MIO_LSTM_TEST_DEBUG > 0) printf("Running backward weights LSTM.\n"); printf("reserve sz: %d, workSpace sz: %d, weight sz: %d\n", - reserveSpaceBwdData.size(), + rsvcpu.size(), workSpaceBwdData.size(), wei_sz); fflush(nullptr); diff --git a/test/main.cpp b/test/main.cpp index b48705754d..a1efb2eff8 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -23,8 +23,10 @@ * SOFTWARE. * *******************************************************************************/ +#include "get_handle.hpp" #include "test.hpp" #include "random.hpp" +#include "workspace.hpp" #include #include #include @@ -37,17 +39,8 @@ struct handle_fixture { miopenHandle_t handle{}; -#if MIOPEN_BACKEND_OPENCL - cl_command_queue q{}; -#endif - handle_fixture() - { - miopenCreate(&handle); -#if MIOPEN_BACKEND_OPENCL - miopenGetStream(handle, &q); -#endif - } + handle_fixture() { miopenCreate(&handle); } ~handle_fixture() { miopenDestroy(handle); } }; @@ -167,8 +160,6 @@ struct conv_forward : output_tensor_fixture { float alpha = 1, beta = 0; - // Setup OpenCL buffers - int n, h, c, w; STATUS(miopenGet4dTensorDescriptorLengths(inputTensor, &n, &c, &h, &w)); size_t sz_in = static_cast(n) * c * h * w; @@ -182,15 +173,12 @@ struct conv_forward : output_tensor_fixture size_t sz_fwd_workspace; STATUS(miopenConvolutionForwardGetWorkSpaceSize( handle, convFilter, inputTensor, convDesc, outputTensor, &sz_fwd_workspace)); - // OCL fails to allocate zero workspace. Let's allocate small workspace instead to simplify - // subsequent code. - if(sz_fwd_workspace == 0) - sz_fwd_workspace = 256; + + Workspace wspace{sz_fwd_workspace}; std::vector in(sz_in); std::vector wei(sz_wei); std::vector out(sz_out); - std::vector fwd_workspace(sz_fwd_workspace / 4); for(size_t i = 0; i < sz_in; i++) { @@ -201,60 +189,16 @@ struct conv_forward : output_tensor_fixture wei[i] = prng::gen_A_to_B(-0.5f, 0.5f) * 0.001f; } -#if MIOPEN_BACKEND_OPENCL - - cl_context ctx; - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); - - cl_int status = CL_SUCCESS; - cl_mem in_dev = clCreateBuffer(ctx, CL_MEM_READ_ONLY, 4 * sz_in, nullptr, &status); - cl_mem wei_dev = clCreateBuffer(ctx, CL_MEM_READ_ONLY, 4 * sz_wei, nullptr, nullptr); - cl_mem out_dev = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 4 * sz_out, nullptr, nullptr); - cl_mem fwd_workspace_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sz_fwd_workspace, nullptr, nullptr); - - status = - clEnqueueWriteBuffer(q, in_dev, CL_TRUE, 0, 4 * sz_in, in.data(), 0, nullptr, nullptr); - status |= clEnqueueWriteBuffer( - q, wei_dev, CL_TRUE, 0, 4 * sz_wei, wei.data(), 0, nullptr, nullptr); - status |= clEnqueueWriteBuffer( - q, out_dev, CL_TRUE, 0, 4 * sz_out, out.data(), 0, nullptr, nullptr); - status |= clEnqueueWriteBuffer(q, - fwd_workspace_dev, - CL_TRUE, - 0, - sz_fwd_workspace, - fwd_workspace.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); - -#elif MIOPEN_BACKEND_HIP - - void* in_dev; - void* wei_dev; - void* out_dev; - void* fwd_workspace_dev; - - EXPECT(hipMalloc(&in_dev, 4 * sz_in) == hipSuccess); - EXPECT(hipMalloc(&wei_dev, 4 * sz_wei) == hipSuccess); - EXPECT(hipMalloc(&out_dev, 4 * sz_out) == hipSuccess); - EXPECT(hipMalloc(&fwd_workspace_dev, sz_fwd_workspace) == hipSuccess); - - EXPECT(hipMemcpy(in_dev, in.data(), 4 * sz_in, hipMemcpyHostToDevice) == hipSuccess); - EXPECT(hipMemcpy(wei_dev, wei.data(), 4 * sz_wei, hipMemcpyHostToDevice) == hipSuccess); - EXPECT(hipMemcpy(out_dev, out.data(), 4 * sz_out, hipMemcpyHostToDevice) == hipSuccess); - EXPECT(hipMemcpy(fwd_workspace_dev, - fwd_workspace.data(), - sz_fwd_workspace, - hipMemcpyHostToDevice) == hipSuccess); + auto& mhand = get_handle(); + + auto in_dev = mhand.Write(in); + auto wei_dev = mhand.Write(wei); + auto out_dev = mhand.Write(out); -#endif int value = 10; - STATUS(miopenSetTensor(handle, inputTensor, in_dev, &value)); + STATUS(miopenSetTensor(handle, inputTensor, in_dev.get(), &value)); - STATUS(miopenScaleTensor(handle, inputTensor, in_dev, &alpha)); + STATUS(miopenScaleTensor(handle, inputTensor, in_dev.get(), &alpha)); float time; @@ -276,32 +220,32 @@ struct conv_forward : output_tensor_fixture STATUS(miopenFindConvolutionForwardAlgorithm( used_handle, inputTensor, - in_dev, + in_dev.get(), convFilter, - wei_dev, + wei_dev.get(), convDesc, outputTensor, - out_dev, + out_dev.get(), 1, &ret_algo_count, &perf, - fwd_workspace_dev, - sz_fwd_workspace, + wspace.ptr(), + wspace.size(), 0)); // MD: Not performing exhaustiveSearch by default for now STATUS(miopenConvolutionForward(used_handle, &alpha, inputTensor, - in_dev, + in_dev.get(), convFilter, - wei_dev, + wei_dev.get(), convDesc, perf.fwd_algo, &beta, outputTensor, - out_dev, - fwd_workspace_dev, - sz_fwd_workspace)); + out_dev.get(), + wspace.ptr(), + wspace.size())); STATUS(miopenGetKernelTime(used_handle, &time)); @@ -318,20 +262,6 @@ struct conv_forward : output_tensor_fixture { CHECK(time == 0.0); } - -// Potential memory leak free memory at end of function -#if MIOPEN_BACKEND_OPENCL - clReleaseMemObject(in_dev); - clReleaseMemObject(wei_dev); - clReleaseMemObject(out_dev); - clReleaseMemObject(fwd_workspace_dev); - -#elif MIOPEN_BACKEND_HIP - hipFree(in_dev); - hipFree(wei_dev); - hipFree(out_dev); - hipFree(fwd_workspace_dev); -#endif } }; diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp index ae55b2e133..33bcb7164f 100644 --- a/test/pooling_common.hpp +++ b/test/pooling_common.hpp @@ -46,6 +46,7 @@ #include "tensor_holder.hpp" #include "verify.hpp" #include "cpu_conv.hpp" +#include "workspace.hpp" #define TEST_PADDING_MODE 0 // NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) @@ -199,9 +200,10 @@ struct verify_forward_pooling auto out = get_output_tensor(filter, input); indices.resize(out.data.size(), 0); - auto in_dev = handle.Write(input.data); - auto out_dev = handle.Create(out.data.size()); - auto workspace_dev = handle.Write(indices); + auto in_dev = handle.Write(input.data); + auto out_dev = handle.Create(out.data.size()); + Workspace wspace{}; + wspace.Write(indices); float alpha = 1, beta = 0; filter.Forward(handle, @@ -212,10 +214,10 @@ struct verify_forward_pooling out.desc, out_dev.get(), true, - workspace_dev.get(), - indices.size() * sizeof(Index)); + wspace.ptr(), + wspace.size()); - indices = handle.Read(workspace_dev, indices.size()); + indices = wspace.Read>(); out.data = handle.Read(out_dev, out.data.size()); return out; } @@ -403,9 +405,8 @@ struct verify_backward_pooling auto out_dev = handle.Write(out.data); auto din_dev = handle.Create(dinput.data.size()); - // std::vector workspace(filter.GetWorkSpaceSize(out.desc)); - // auto workspace_dev = handle.Write(workspace); - auto workspace_dev = handle.Write(indices); + Workspace wspace{}; + wspace.Write(indices); float alpha = 1, beta = 0; filter.Backward(handle, @@ -423,7 +424,7 @@ struct verify_backward_pooling // dx dinput.desc, din_dev.get(), - workspace_dev.get()); + wspace.ptr()); dinput.data = handle.Read(din_dev, dinput.data.size()); return dinput; diff --git a/test/reduce_test.cpp b/test/reduce_test.cpp index 3ca771fc64..dc92a20318 100644 --- a/test/reduce_test.cpp +++ b/test/reduce_test.cpp @@ -27,6 +27,7 @@ #include "driver.hpp" #include "test.hpp" #include "verify.hpp" +#include "workspace.hpp" #include "get_handle.hpp" #include "tensor_holder.hpp" #include "random.hpp" @@ -60,28 +61,26 @@ struct verify_reduce_with_indices miopenReduceTensorIndices_t indicesOpt; miopenIndicesType_t indicesType; - verify_reduce_with_indices( // NOLINT (hicpp-member-init) - const miopen::ReduceTensorDescriptor& reduce_, - const tensor& input_, - const tensor& output_, - const tensor& workspace_, - const tensor& indices_, - float alpha_, - float beta_) + verify_reduce_with_indices(const miopen::ReduceTensorDescriptor& reduce_, + const tensor& input_, + const tensor& output_, + const tensor& workspace_, + const tensor& indices_, + float alpha_, + float beta_) + : reduce(reduce_), + input(input_), + output(output_), + workspace(workspace_), + indices(indices_), + alpha(alpha_), + beta(beta_), + reduceOp(reduce.reduceTensorOp_), + compTypeVal(reduce.reduceTensorCompType_), + nanOpt(reduce.reduceTensorNanOpt_), + indicesOpt(reduce.reduceTensorIndices_), + indicesType(reduce.reduceTensorIndicesType_) { - reduce = reduce_; - input = input_; - output = output_; - workspace = workspace_; - indices = indices_; - alpha = alpha_; - beta = beta_; - - reduceOp = reduce.reduceTensorOp_; - compTypeVal = reduce.reduceTensorCompType_; - nanOpt = reduce.reduceTensorNanOpt_; - indicesOpt = reduce.reduceTensorIndices_; - indicesType = reduce.reduceTensorIndicesType_; } tensor cpu() const @@ -343,10 +342,11 @@ struct verify_reduce_with_indices auto res = output; auto res_indices = indices; - auto indices_dev = handle.Write(indices.data); + Workspace idxspace{}; + idxspace.Write(indices.data); - std::size_t ws_sizeInBytes = workspace.desc.GetElementSize() * sizeof(T); - std::size_t indices_sizeInBytes = indices.desc.GetElementSize() * sizeof(int); + Workspace wspace{}; + wspace.Write(workspace.data); const double alpha64 = alpha; const double beta64 = beta; @@ -358,15 +358,13 @@ struct verify_reduce_with_indices ? static_cast(&beta64) : static_cast(&beta); - if(ws_sizeInBytes > 0) + if(wspace.size() > 0) { - auto workspace_dev = handle.Write(workspace.data); - reduce.ReduceTensor(get_handle(), - indices_dev.get(), - indices_sizeInBytes, - workspace_dev.get(), - ws_sizeInBytes, + idxspace.ptr(), + idxspace.size(), + wspace.ptr(), + wspace.size(), alphaPtr, input.desc, input_dev.get(), @@ -377,8 +375,8 @@ struct verify_reduce_with_indices else { reduce.ReduceTensor(get_handle(), - indices_dev.get(), - indices_sizeInBytes, + idxspace.ptr(), + idxspace.size(), nullptr, 0, alphaPtr, @@ -390,7 +388,7 @@ struct verify_reduce_with_indices }; res.data = handle.Read(output_dev, res.data.size()); - res_indices.data = handle.Read(indices_dev, res_indices.data.size()); + res_indices.data = idxspace.Read(); return (std::make_tuple(res, res_indices)); } @@ -647,7 +645,8 @@ struct verify_reduce_no_indices // replicate auto res = output; - std::size_t ws_sizeInBytes = workspace.desc.GetElementSize() * sizeof(T); + Workspace wspace{}; + wspace.Write(workspace.data); const double alpha64 = alpha; const double beta64 = beta; @@ -659,15 +658,13 @@ struct verify_reduce_no_indices ? static_cast(&beta64) : static_cast(&beta); - if(ws_sizeInBytes > 0) + if(wspace.size() > 0) { - auto workspace_dev = handle.Write(workspace.data); - reduce.ReduceTensor(get_handle(), nullptr, 0, - workspace_dev.get(), - ws_sizeInBytes, + wspace.ptr(), + wspace.size(), alphaPtr, input.desc, input_dev.get(), diff --git a/test/rnn_vanilla_common.hpp b/test/rnn_vanilla_common.hpp index ef5c1088c3..854f682068 100644 --- a/test/rnn_vanilla_common.hpp +++ b/test/rnn_vanilla_common.hpp @@ -35,6 +35,7 @@ #include "verify.hpp" #include "rnn_util.hpp" #include "random.hpp" +#include "workspace.hpp" #include #include #include @@ -1436,8 +1437,7 @@ struct verify_forward_infer_rnn miopen::deref(rnnDesc).dataType); miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workSpaceSize); - - std::vector workSpace(workSpaceSize / sizeof(T)); + Workspace wspace{workSpaceSize}; auto input_dev = handle.Write(input); @@ -1450,8 +1450,6 @@ struct verify_forward_infer_rnn std::fill(hy.begin(), hy.end(), 0.); auto hy_dev = handle.Write(hy); - auto workSpace_dev = handle.Write(workSpace); - std::vector hlens(3, 0); hlens[0] = nLayers * ((dirMode != 0) ? 2 : 1); hlens[1] = batch_seq[0]; @@ -1479,8 +1477,8 @@ struct verify_forward_infer_rnn ((nohy) ? nullptr : hy_dev.get()), &hiddenDesc, nullptr, - workSpace_dev.get(), - workSpaceSize); + wspace.ptr(), + wspace.size()); #if(MIO_RNN_TEST_DEBUG == 2) auto outdata = handle.Read(output_dev, output.size()); @@ -1708,11 +1706,13 @@ struct verify_forward_train_rnn miopen::deref(rnnDesc).dataType); miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workSpaceSize); + Workspace wspace{workSpaceSize}; + miopenGetRNNTrainingReserveSize( &handle, rnnDesc, seqLength, inputDescs.data(), &reserveSpaceSize); + reserveSpaceSize = (reserveSpaceSize + (sizeof(T) - 1)) & ~(sizeof(T) - 1); - std::vector workSpace(workSpaceSize / sizeof(T)); - std::vector reserveSpace((reserveSpaceSize + sizeof(T) - 1) / sizeof(T)); + Workspace rspace{reserveSpaceSize}; auto input_dev = handle.Write(input); @@ -1726,9 +1726,6 @@ struct verify_forward_train_rnn std::fill(hy.begin(), hy.end(), 0.); auto hy_dev = handle.Write(hy); - auto workSpace_dev = handle.Write(workSpace); - auto reserveSpace_dev = handle.Write(reserveSpace); - std::vector hlens(3, 0); hlens[0] = nLayers * ((dirMode != 0) ? 2 : 1); hlens[1] = batch_seq[0]; @@ -1756,10 +1753,10 @@ struct verify_forward_train_rnn ((nohy) ? nullptr : hy_dev.get()), &hiddenDesc, nullptr, - workSpace_dev.get(), - workSpaceSize, - reserveSpace_dev.get(), - reserveSpaceSize); + wspace.ptr(), + wspace.size(), + rspace.ptr(), + rspace.size()); #if(MIO_RNN_TEST_DEBUG == 2) auto outdata = handle.Read(output_dev, output.size()); @@ -1769,10 +1766,9 @@ struct verify_forward_train_rnn } #endif - auto retSet = std::make_tuple( - handle.Read(output_dev, output.size()), - (nohy ? initHidden : handle.Read(hy_dev, hy.size())), - handle.Read(reserveSpace_dev, (reserveSpaceSize + sizeof(T) - 1) / sizeof(T))); + auto retSet = std::make_tuple(handle.Read(output_dev, output.size()), + (nohy ? initHidden : handle.Read(hy_dev, hy.size())), + rspace.Read>()); #if(MIO_RNN_TIME_EVERYTHING == 1) auto t_end = std::chrono::high_resolution_clock::now(); @@ -2002,15 +1998,15 @@ struct verify_backward_data_rnn miopen::deref(rnnDesc).dataType); miopenGetRNNWorkspaceSize(&handle, rnnDesc, seqLength, inputDescs.data(), &workSpaceSize); - std::vector workSpace(workSpaceSize / sizeof(T)); - auto workSpace_dev = handle.Write(workSpace); + Workspace wspace{workSpaceSize}; miopenGetRNNInputTensorSize(&handle, rnnDesc, seqLength, outputDescs.data(), &out_sz); auto yin_dev = handle.Write(yin); auto dyin_dev = handle.Write(dy); // auto dhyin_dev = handle.Write(dhy); - auto reserveSpace_dev = handle.Write(reserveSpace); - auto weights_dev = handle.Write(weights); + Workspace rspace{}; + rspace.Write(reserveSpace); + auto weights_dev = handle.Write(weights); // auto hx_dev = handle.Write(initHidden); std::vector hlens(3, 0); @@ -2054,15 +2050,15 @@ struct verify_backward_data_rnn ((nodhx) ? nullptr : dhx_dev.get()), &hiddenDesc, nullptr, - workSpace_dev.get(), - workSpaceSize, - reserveSpace_dev.get(), - reserveSpace.size() * sizeof(T)); + wspace.ptr(), + wspace.size(), + rspace.ptr(), + rspace.size()); auto retSet = std::make_tuple(handle.Read(dx_dev, dx.size()), (nodhx ? initHidden : handle.Read(dhx_dev, dhx.size())), - handle.Read(reserveSpace_dev, reserveSpace.size()), - handle.Read(workSpace_dev, workSpace.size())); + rspace.Read>(), + wspace.Read>()); #if(MIO_RNN_TIME_EVERYTHING == 1) auto t_end = std::chrono::high_resolution_clock::now(); @@ -2257,8 +2253,11 @@ struct verify_backward_weights_rnn hiddenSize * ((dirMode != 0) ? 2 : 1), miopen::deref(rnnDesc).dataType); - auto workSpace_dev = handle.Write(workSpace); - auto reserveSpace_dev = handle.Write(reserveSpace); + Workspace wspace{}; + wspace.Write(workSpace); + Workspace rspace{}; + rspace.Write(reserveSpace); + std::vector dweights(weightSize); auto dweights_dev = handle.Write(dweights); miopen::TensorDescriptor weightDesc(miopen::deref(rnnDesc).dataType, {weightSize}); @@ -2283,10 +2282,10 @@ struct verify_backward_weights_rnn dy_dev.get(), &weightDesc, dweights_dev.get(), - workSpace_dev.get(), - workSpace.size() * sizeof(T), - reserveSpace_dev.get(), - reserveSpace.size() * sizeof(T)); + wspace.ptr(), + wspace.size(), + rspace.ptr(), + rspace.size()); #if(MIO_RNN_TIME_EVERYTHING == 1) auto t_end = std::chrono::high_resolution_clock::now(); diff --git a/test/tensor_reorder.cpp b/test/tensor_reorder.cpp index dc1a38f508..bf40e7ee38 100644 --- a/test/tensor_reorder.cpp +++ b/test/tensor_reorder.cpp @@ -38,6 +38,8 @@ #include "test.hpp" #include "driver.hpp" #include "random.hpp" +#include "get_handle.hpp" +#include "workspace.hpp" template <> struct miopen_type : std::integral_constant @@ -286,19 +288,6 @@ bool verify_tensor(tensor& t_gpu, tensor& t_cpu) struct tensor_reorder_base_driver : test_driver { - miopenHandle_t handle{}; -#if MIOPEN_BACKEND_OPENCL - cl_command_queue q{}; -#endif - - tensor_reorder_base_driver() - { - miopenCreate(&handle); -#if MIOPEN_BACKEND_OPENCL - miopenGetStream(handle, &q); -#endif - } - ~tensor_reorder_base_driver() { miopenDestroy(handle); } static std::vector get_dim_3_size() { return {1, 9}; } static std::vector get_dim_2_size() { return {1, 9}; } @@ -363,14 +352,14 @@ struct tensor_reorder_driver : tensor_reorder_base_driver // NOLINTBEGIN(clang-analyzer-cplusplus.NewDeleteLeaks) void run() { - auto run_reorder = [this](uint32_t dim_0, - uint32_t dim_1, - uint32_t dim_2, - uint32_t dim_3, - uint32_t order_0, - uint32_t order_1, - uint32_t order_2, - uint32_t order_3) { + auto run_reorder = [](uint32_t dim_0, + uint32_t dim_1, + uint32_t dim_2, + uint32_t dim_3, + uint32_t order_0, + uint32_t order_1, + uint32_t order_2, + uint32_t order_3) { int tensor_sz = dim_0 * dim_1 * dim_2 * dim_3; std::vector tensor_len({static_cast(dim_0), static_cast(dim_1), @@ -392,8 +381,9 @@ struct tensor_reorder_driver : tensor_reorder_base_driver tensor t_dst_gpu(tensor_len, tensor_strides); rand_tensor_integer(t_src); + auto& handle = get_handle(); miopen::ExecutionContext ctx; - ctx.SetStream(&miopen::deref(this->handle)); + ctx.SetStream(&handle); // ctx.SetupFloats(); auto reorder_sol = MakeTensorReorderAttributes(ctx, to_miopen_data_type::get(), @@ -406,36 +396,13 @@ struct tensor_reorder_driver : tensor_reorder_base_driver order_2, order_3); EXPECT(reorder_sol != nullptr); - size_t workspace = reorder_sol->IsSkippable() ? sizeof(T) * tensor_sz - : reorder_sol->GetOutputTensorSize(); -#if MIOPEN_BACKEND_OPENCL - cl_context cl_ctx; - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &cl_ctx, nullptr); - cl_int status = CL_SUCCESS; - cl_mem src_dev = - clCreateBuffer(cl_ctx, CL_MEM_READ_WRITE, sizeof(T) * tensor_sz, nullptr, &status); - cl_mem dst_dev = clCreateBuffer(cl_ctx, CL_MEM_READ_WRITE, workspace, nullptr, nullptr); - status |= clEnqueueWriteBuffer(q, - src_dev, - CL_TRUE, - 0, - sizeof(T) * tensor_sz, - t_src.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP - void* src_dev; - void* dst_dev; - EXPECT(hipMalloc(&src_dev, sizeof(T) * tensor_sz) == hipSuccess); - EXPECT(hipMalloc(&dst_dev, workspace) == hipSuccess); - EXPECT(hipMemcpy( - src_dev, t_src.data.data(), sizeof(T) * tensor_sz, hipMemcpyHostToDevice) == - hipSuccess); -#endif - const auto invoke_param = reorder_invoke_param{ - DataCast(static_cast(src_dev)), DataCast(dst_dev)}; + size_t workspace_size = reorder_sol->IsSkippable() ? sizeof(T) * tensor_sz + : reorder_sol->GetOutputTensorSize(); + Workspace wspace{workspace_size}; + + auto src_dev = handle.Write(t_src.data); + + const auto invoke_param = reorder_invoke_param{src_dev.get(), wspace.ptr()}; std::vector opArgs = reorder_sol->GetKernelArg(); boost::optional invoker_factory( [=](const std::vector& kernels) mutable { @@ -451,10 +418,9 @@ struct tensor_reorder_driver : tensor_reorder_base_driver }); std::vector construction_params{ reorder_sol->GetKernelInfo()}; - const auto invoker = - miopen::deref(this->handle).PrepareInvoker(*invoker_factory, construction_params); + const auto invoker = handle.PrepareInvoker(*invoker_factory, construction_params); // run gpu - invoker(miopen::deref(this->handle), invoke_param); + invoker(handle, invoke_param); // run cpu cpu_reorder::run(t_dst.data.data(), t_src.data.data(), @@ -467,18 +433,9 @@ struct tensor_reorder_driver : tensor_reorder_base_driver order_2, order_3); invoker_factory = boost::none; -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer( - q, dst_dev, CL_TRUE, 0, workspace, t_dst_gpu.data.data(), 0, nullptr, nullptr); - EXPECT(status == CL_SUCCESS); - clReleaseMemObject(dst_dev); - clReleaseMemObject(src_dev); -#elif MIOPEN_BACKEND_HIP - EXPECT(hipMemcpy(t_dst_gpu.data.data(), dst_dev, workspace, hipMemcpyDeviceToHost) == - hipSuccess); - hipFree(dst_dev); - hipFree(src_dev); -#endif + + t_dst_gpu.data = wspace.Read(); + // we expect excact match, since use integer bool valid_result = verify_tensor(t_dst_gpu, t_dst); std::cout << "[" << reorder_str::get(order_0, order_1, order_2, order_3) << ", b" diff --git a/test/workspace.hpp b/test/workspace.hpp new file mode 100644 index 0000000000..93522b1cd4 --- /dev/null +++ b/test/workspace.hpp @@ -0,0 +1,163 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include + +#include "get_handle.hpp" + +#define HIP_CHECK(exp) \ + if((exp) != hipSuccess) \ + { \ + MIOPEN_LOG_E(#exp "failed at line: " << __LINE__ << " in file: " << __FILE__); \ + } + +class Workspace +{ + + // RAII class for hip allocations + class GPUBuffer + { + public: + GPUBuffer() = default; + + explicit GPUBuffer(size_t num_bytes) : sz_(num_bytes) + { + if(num_bytes > 0) + { + HIP_CHECK(hipMalloc(&buf_, num_bytes)); + assert(buf_ != nullptr); + } + else + { + buf_ = nullptr; + } + } + + ~GPUBuffer() { FreeBuf(); } + + void* ptr() { return buf_; } + void* ptr() const { return buf_; } + + auto size() const { return sz_; } + + GPUBuffer(const GPUBuffer&) = delete; + GPUBuffer& operator=(const GPUBuffer&) = delete; + + GPUBuffer(GPUBuffer&& that) noexcept : buf_(that.buf_), sz_(that.sz_) + { + that.buf_ = nullptr; // take over ownership + that.sz_ = 0; + } + + GPUBuffer& operator=(GPUBuffer&& that) noexcept + { + FreeBuf(); + std::swap(this->buf_, that.buf_); + std::swap(this->sz_, that.sz_); + return *this; + } + + private: + void FreeBuf() + { + HIP_CHECK(hipFree(buf_)); + buf_ = nullptr; + sz_ = 0; + } + + void* buf_ = nullptr; + size_t sz_ = 0; + }; + + // for use in miopen .*GetWorkSpaceSize() methods where a pointer to size_t is + // passed to capture the size. Must call AdjustToSize() after calling such a method + size_t* SizePtr() { return &sz_; } + + void AdjustToSize() + { + if(sz_ != gpu_buf_.size()) + { + gpu_buf_ = GPUBuffer(sz_); + } + } + +public: + explicit Workspace(size_t sz = 0) : sz_(sz) { AdjustToSize(); } + + Workspace(const Workspace&) = delete; + Workspace& operator=(const Workspace&) = delete; + Workspace(Workspace&&) = default; + Workspace& operator=(Workspace&&) = default; + + size_t size() const { return sz_; } + + void resize(size_t sz_in_bytes) + { + sz_ = sz_in_bytes; + AdjustToSize(); + } + + auto ptr() const { return gpu_buf_.ptr(); } + + auto ptr() { return gpu_buf_.ptr(); } + + template + void Write(const V& vec) + { + using T = typename V::value_type; + resize(vec.size() * sizeof(T)); + HIP_CHECK(hipMemcpy(this->ptr(), &vec[0], size(), hipMemcpyHostToDevice)); + } + + template + void ReadTo(V& vec) const + { + using T = typename V::value_type; + if(vec.size() * sizeof(T) != size()) + { + MIOPEN_LOG_E("vector of wrong size passed"); + std::abort(); + } + HIP_CHECK(hipMemcpy(&vec[0], ptr(), size(), hipMemcpyDeviceToHost)); + } + + template + V Read() const + { + using T = typename V::value_type; + size_t num_elem = size() / sizeof(T); + V ret(num_elem); + ReadTo(ret); + return ret; + } + +private: + // miopen::Handle& handle_; + // miopen::Allocator::ManageDataPtr data_{}; + GPUBuffer gpu_buf_{}; + size_t sz_{}; +};