From 7e99f63c8374b906be3783084cb14415cdd90870 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Tue, 22 Aug 2023 11:59:43 +0200 Subject: [PATCH] Redefine pitches This PR shifts the values returned from getPitchesInBytes to be consistent with std::mdspan (except in bytes). Example: the pitch vector for the extent {42, 10, 2} changes: Before: {3360, 80, 8} After: {80, 8, 4} The new meaning is that the pitch value is the number of bytes to jump from one element to the next in the given dimension. Fixes: #2083 --- example/bufferCopy/src/bufferCopy.cpp | 8 +- example/randomCells2D/src/randomCells2D.cpp | 12 +- include/alpaka/idx/MapIdx.hpp | 25 ++-- .../alpaka/mem/buf/BufUniformCudaHipRt.hpp | 12 +- include/alpaka/mem/buf/cpu/Copy.hpp | 32 +++-- include/alpaka/mem/buf/cpu/Set.hpp | 19 ++- .../alpaka/mem/buf/uniformCudaHip/Copy.hpp | 65 +++++----- include/alpaka/mem/buf/uniformCudaHip/Set.hpp | 12 +- include/alpaka/mem/view/Traits.hpp | 60 ++++----- include/alpaka/mem/view/ViewAccessOps.hpp | 10 +- include/alpaka/mem/view/ViewSubView.hpp | 119 +++++------------- include/alpaka/test/mem/view/Iterator.hpp | 52 ++------ include/alpaka/test/mem/view/ViewTest.hpp | 11 +- test/integ/mandelbrot/src/mandelbrot.cpp | 25 ++-- test/integ/matMul/src/matMul.cpp | 30 +++-- test/unit/mem/buf/src/BufTest.cpp | 22 +--- test/unit/mem/view/src/ViewSubViewTest.cpp | 31 ++--- 17 files changed, 201 insertions(+), 344 deletions(-) diff --git a/example/bufferCopy/src/bufferCopy.cpp b/example/bufferCopy/src/bufferCopy.cpp index 692cfa83e4d4..f56fd21aeeb1 100644 --- a/example/bufferCopy/src/bufferCopy.cpp +++ b/example/bufferCopy/src/bufferCopy.cpp @@ -215,10 +215,10 @@ auto main() -> int // padding between rows/planes of multidimensional memory allocations. // Therefore the pitch (distance between consecutive rows/planes) may be // greater than the space required for the data. 
- Idx const deviceBuffer1Pitch(alpaka::getPitchesInBytes(deviceBuffer1)[2] / sizeof(Data)); - Idx const deviceBuffer2Pitch(alpaka::getPitchesInBytes(deviceBuffer2)[2] / sizeof(Data)); - Idx const hostBuffer1Pitch(alpaka::getPitchesInBytes(hostBuffer)[2] / sizeof(Data)); - Idx const hostViewPlainPtrPitch(alpaka::getPitchesInBytes(hostViewPlainPtr)[2] / sizeof(Data)); + Idx const deviceBuffer1Pitch(alpaka::getPitchesInBytes(deviceBuffer1)[1] / sizeof(Data)); + Idx const deviceBuffer2Pitch(alpaka::getPitchesInBytes(deviceBuffer2)[1] / sizeof(Data)); + Idx const hostBuffer1Pitch(alpaka::getPitchesInBytes(hostBuffer)[1] / sizeof(Data)); + Idx const hostViewPlainPtrPitch(alpaka::getPitchesInBytes(hostViewPlainPtr)[1] / sizeof(Data)); // Test device Buffer // diff --git a/example/randomCells2D/src/randomCells2D.cpp b/example/randomCells2D/src/randomCells2D.cpp index 04d27b888ae8..4db4475adf1b 100644 --- a/example/randomCells2D/src/randomCells2D.cpp +++ b/example/randomCells2D/src/randomCells2D.cpp @@ -201,16 +201,16 @@ auto main() -> int RandomEngineVector* const ptrBufAccRandV{alpaka::getPtrNative(bufAccRandV)}; InitRandomKernel initRandomKernel; - auto pitchBufAccRandS = alpaka::getPitchesInBytes(bufAccRandS)[1]; + auto pitchBufAccRandS = alpaka::getPitchesInBytes(bufAccRandS)[0]; alpaka::exec(queue, workdiv, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS); alpaka::wait(queue); - auto pitchBufAccRandV = alpaka::getPitchesInBytes(bufAccRandV)[1]; + auto pitchBufAccRandV = alpaka::getPitchesInBytes(bufAccRandV)[0]; alpaka::exec(queue, workdiv, initRandomKernel, extent, ptrBufAccRandV, pitchBufAccRandV); alpaka::wait(queue); - auto pitchHostS = alpaka::getPitchesInBytes(bufHostS)[1]; - auto pitchHostV = alpaka::getPitchesInBytes(bufHostV)[1]; + auto pitchHostS = alpaka::getPitchesInBytes(bufHostS)[0]; + auto pitchHostV = alpaka::getPitchesInBytes(bufHostV)[0]; for(Idx y = 0; y < numY; ++y) { @@ -221,7 +221,7 @@ auto main() -> int } } - auto pitchBufAccS = 
alpaka::getPitchesInBytes(bufAccS)[1]; + auto pitchBufAccS = alpaka::getPitchesInBytes(bufAccS)[0]; alpaka::memcpy(queue, bufAccS, bufHostS); RunTimestepKernelSingle runTimestepKernelSingle; alpaka::exec( @@ -235,7 +235,7 @@ auto main() -> int pitchBufAccS); alpaka::memcpy(queue, bufHostS, bufAccS); - auto pitchBufAccV = alpaka::getPitchesInBytes(bufAccV)[1]; + auto pitchBufAccV = alpaka::getPitchesInBytes(bufAccV)[0]; alpaka::memcpy(queue, bufAccV, bufHostV); RunTimestepKernelVector runTimestepKernelVector; alpaka::exec( diff --git a/include/alpaka/idx/MapIdx.hpp b/include/alpaka/idx/MapIdx.hpp index 46f86458625f..b808b73d4656 100644 --- a/include/alpaka/idx/MapIdx.hpp +++ b/include/alpaka/idx/MapIdx.hpp @@ -5,6 +5,7 @@ #pragma once #include "alpaka/core/Common.hpp" +#include "alpaka/vec/Traits.hpp" #include "alpaka/vec/Vec.hpp" #include @@ -136,10 +137,11 @@ namespace alpaka namespace detail { - //! Maps a linear index to a N dimensional index assuming a buffer wihtout padding. + //! Maps a linear index to a N dimensional index assuming a buffer without padding. template struct MapIdxPitchBytes; - //! Maps a N dimensional index to the same N dimensional index assuming a buffer wihtout padding. + + //! Maps a N dimensional index to the same N dimensional index assuming a buffer without padding. template struct MapIdxPitchBytes { @@ -156,7 +158,7 @@ namespace alpaka return idx; } }; - //! Maps a 1 dimensional index to a N dimensional index assuming a buffer wihtout padding. + //! Maps a 1 dimensional index to a N dimensional index assuming a buffer without padding. 
template struct MapIdxPitchBytes 1u)>> { @@ -170,15 +172,15 @@ namespace alpaka Vec, TElem> const& idx, Vec, TElem> const& pitch) -> Vec, TElem> { - auto idxNd = Vec, TElem>::all(0u); + auto idxNd = Vec, TElem>::zeros(); constexpr std::size_t lastIdx = TidxDimOut - 1u; TElem tmp = idx[0u]; for(std::size_t d(0u); d < lastIdx; ++d) { - idxNd[d] = static_cast(tmp / pitch[d + 1]); - tmp %= pitch[d + 1]; + idxNd[d] = static_cast(tmp / pitch[d]); + tmp %= pitch[d]; } idxNd[lastIdx] = tmp; @@ -199,13 +201,8 @@ namespace alpaka Vec, TElem> const& idx, Vec, TElem> const& pitch) -> Vec, TElem> { - constexpr auto lastDim = TidxDimIn - 1; - TElem idx1d = idx[lastDim]; - for(std::size_t d(0u); d < lastDim; ++d) - { - idx1d = static_cast(idx1d + pitch[d + 1] * idx[d]); - } - return {idx1d}; + using DimMinusOne = DimInt; + return {idx.back() + (subVecBegin(pitch) * subVecBegin(idx)).sum()}; } }; @@ -234,7 +231,7 @@ namespace alpaka }; } // namespace detail - //! Maps a N dimensional index to a N dimensional position based on + //! Maps an N dimensional index to a N dimensional position based on //! pitch in a buffer without padding or a byte buffer. //! //! \tparam TidxDimOut Dimension of the index vector to map to. 
diff --git a/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp b/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp index d56dde8d551d..554077562aaf 100644 --- a/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp +++ b/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp @@ -182,16 +182,18 @@ namespace alpaka struct GetPitchesInBytes> { ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt const& buf) const + -> Vec { Vec v{}; if constexpr(TDim::value > 0) { + v.back() = sizeof(TElem); if constexpr(TDim::value > 1) - v.back() = buf.m_rowPitchInBytes; - else - v.back() = buf.m_extentElements.back() * sizeof(TElem); - for(int i = static_cast(TDim::value) - 2; i >= 0; i--) - v[i] = buf.m_extentElements[i] * v[i + 1]; + { + v[TDim::value - 2] = static_cast(buf.m_rowPitchInBytes); + for(TIdx i = TDim::value - 2; i > 0; i--) + v[i - 1] = buf.m_extentElements[i] * v[i]; + } } return v; } diff --git a/include/alpaka/mem/buf/cpu/Copy.hpp b/include/alpaka/mem/buf/cpu/Copy.hpp index 098c9b406049..29c8e7be43b4 100644 --- a/include/alpaka/mem/buf/cpu/Copy.hpp +++ b/include/alpaka/mem/buf/cpu/Copy.hpp @@ -29,6 +29,8 @@ namespace alpaka template struct TaskCopyCpuBase { + static_assert(TDim::value > 0); + using ExtentSize = Idx; using DstSize = Idx; using SrcSize = Idx; @@ -37,7 +39,7 @@ namespace alpaka template TaskCopyCpuBase(TViewFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent) : m_extent(getExtents(extent)) - , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast(sizeof(Elem))) + , m_extentWidthBytes(m_extent.back() * static_cast(sizeof(Elem))) #if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL) , m_dstExtent(getExtents(viewDst)) , m_srcExtent(getExtents(viewSrc)) @@ -49,10 +51,13 @@ namespace alpaka { if constexpr(TDim::value > 0) { - ALPAKA_ASSERT((castVec(m_extent) <= m_dstExtent).foldrAll(std::logical_or())); - ALPAKA_ASSERT((castVec(m_extent) <= m_srcExtent).foldrAll(std::logical_or())); - ALPAKA_ASSERT(static_cast(m_extentWidthBytes) <= 
m_dstPitchBytes[TDim::value - 1u]); - ALPAKA_ASSERT(static_cast(m_extentWidthBytes) <= m_srcPitchBytes[TDim::value - 1u]); + ALPAKA_ASSERT((castVec(m_extent) <= m_dstExtent).all()); + ALPAKA_ASSERT((castVec(m_extent) <= m_srcExtent).all()); + if constexpr(TDim::value > 1) + { + ALPAKA_ASSERT(static_cast(m_extentWidthBytes) <= m_dstPitchBytes[TDim::value - 2]); + ALPAKA_ASSERT(static_cast(m_extentWidthBytes) <= m_srcPitchBytes[TDim::value - 2]); + } } } @@ -100,10 +105,9 @@ namespace alpaka #endif // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one // iteration. - Vec const extentWithoutInnermost(subVecBegin(this->m_extent)); - // [z, y, x] -> [y, x] because the z pitch (the full size of the buffer) is not required. - Vec const dstPitchBytesWithoutOutmost(subVecEnd(this->m_dstPitchBytes)); - Vec const srcPitchBytesWithoutOutmost(subVecEnd(this->m_srcPitchBytes)); + Vec const extentWithoutInnermost = subVecBegin(this->m_extent); + Vec const dstPitchBytesWithoutOutmost = subVecBegin(this->m_dstPitchBytes); + Vec const srcPitchBytesWithoutOutmost = subVecBegin(this->m_srcPitchBytes); if(static_cast(this->m_extent.prod()) != 0u) { @@ -112,14 +116,8 @@ namespace alpaka [&](Vec const& idx) { std::memcpy( - reinterpret_cast( - this->m_dstMemNative - + (castVec(idx) * dstPitchBytesWithoutOutmost) - .foldrAll(std::plus())), - reinterpret_cast( - this->m_srcMemNative - + (castVec(idx) * srcPitchBytesWithoutOutmost) - .foldrAll(std::plus())), + this->m_dstMemNative + (castVec(idx) * dstPitchBytesWithoutOutmost).sum(), + this->m_srcMemNative + (castVec(idx) * srcPitchBytesWithoutOutmost).sum(), static_cast(this->m_extentWidthBytes)); }); } diff --git a/include/alpaka/mem/buf/cpu/Set.hpp b/include/alpaka/mem/buf/cpu/Set.hpp index b75dd4135def..57d50d5b805e 100644 --- a/include/alpaka/mem/buf/cpu/Set.hpp +++ b/include/alpaka/mem/buf/cpu/Set.hpp @@ -23,6 +23,8 @@ namespace alpaka template struct TaskSetCpuBase { + 
static_assert(TDim::value > 0); + using ExtentSize = Idx; using DstSize = Idx; using Elem = alpaka::Elem; @@ -31,15 +33,16 @@ namespace alpaka TaskSetCpuBase(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent) : m_byte(byte) , m_extent(getExtents(extent)) - , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast(sizeof(Elem))) + , m_extentWidthBytes(m_extent.back() * static_cast(sizeof(Elem))) #if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL) , m_dstExtent(getExtents(view)) #endif , m_dstPitchBytes(getPitchesInBytes(view)) , m_dstMemNative(reinterpret_cast(getPtrNative(view))) { - ALPAKA_ASSERT((castVec(m_extent) <= m_dstExtent).foldrAll(std::logical_or())); - ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes[TDim::value - 1u]); + ALPAKA_ASSERT((castVec(m_extent) <= m_dstExtent).all()); + if constexpr(TDim::value > 1) + ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes[TDim::value - 2]); } #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL @@ -80,9 +83,8 @@ namespace alpaka #endif // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one // iteration. - Vec const extentWithoutInnermost(subVecBegin(this->m_extent)); - // [z, y, x] -> [y, x] because the z pitch (the full idx of the buffer) is not required. 
- Vec const dstPitchBytesWithoutOutmost(subVecEnd(this->m_dstPitchBytes)); + Vec const extentWithoutInnermost = subVecBegin(this->m_extent); + Vec const dstPitchBytesWithoutOutmost = subVecBegin(this->m_dstPitchBytes); if(static_cast(this->m_extent.prod()) != 0u) { @@ -91,10 +93,7 @@ namespace alpaka [&](Vec const& idx) { std::memset( - reinterpret_cast( - this->m_dstMemNative - + (castVec(idx) * dstPitchBytesWithoutOutmost) - .foldrAll(std::plus())), + this->m_dstMemNative + (castVec(idx) * dstPitchBytesWithoutOutmost).sum(), this->m_byte, static_cast(this->m_extentWidthBytes)); }); diff --git a/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp b/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp index 80ce19ad9dd8..37ee6fb7aa24 100644 --- a/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp +++ b/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp @@ -202,8 +202,8 @@ namespace alpaka , m_dstHeight(static_cast(getHeight(viewDst))) , m_srcHeight(static_cast(getHeight(viewSrc))) # endif - , m_dstPitchBytes(static_cast(getPitchesInBytes(viewDst)[Dim::value - 1u])) - , m_srcPitchBytes(static_cast(getPitchesInBytes(viewSrc)[Dim::value - 1u])) + , m_dstRowPitchBytes(static_cast(getPitchesInBytes(viewDst)[0])) + , m_srcRowPitchBytes(static_cast(getPitchesInBytes(viewSrc)[0])) , m_dstMemNative(reinterpret_cast(getPtrNative(viewDst))) , m_srcMemNative(reinterpret_cast(getPtrNative(viewSrc))) { @@ -212,8 +212,8 @@ namespace alpaka ALPAKA_ASSERT(m_extentHeight <= m_dstHeight); ALPAKA_ASSERT(m_extentWidth <= m_srcWidth); ALPAKA_ASSERT(m_extentHeight <= m_srcHeight); - ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes); - ALPAKA_ASSERT(m_extentWidthBytes <= m_srcPitchBytes); + ALPAKA_ASSERT(m_extentWidthBytes <= m_dstRowPitchBytes); + ALPAKA_ASSERT(m_extentWidthBytes <= m_srcRowPitchBytes); # endif } @@ -236,9 +236,9 @@ namespace alpaka // Initiate the memory copy. 
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memcpy2DAsync( m_dstMemNative, - m_dstPitchBytes, + m_dstRowPitchBytes, m_srcMemNative, - m_srcPitchBytes, + m_srcRowPitchBytes, m_extentWidthBytes, static_cast(m_extentHeight), m_uniformMemCpyKind, @@ -251,9 +251,9 @@ namespace alpaka { std::cout << __func__ << " ew: " << m_extentWidth << " eh: " << m_extentHeight << " ewb: " << m_extentWidthBytes << " ddev: " << m_iDstDevice << " dw: " << m_dstWidth - << " dh: " << m_dstHeight << " dptr: " << m_dstMemNative << " dpitchb: " << m_dstPitchBytes + << " dh: " << m_dstHeight << " dptr: " << m_dstMemNative << " dpitch: " << m_dstRowPitchBytes << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sh: " << m_srcHeight - << " sptr: " << m_srcMemNative << " spitchb: " << m_srcPitchBytes << std::endl; + << " sptr: " << m_srcMemNative << " spitch: " << m_srcRowPitchBytes << std::endl; } # endif @@ -272,8 +272,8 @@ namespace alpaka Idx m_dstHeight; Idx m_srcHeight; # endif - std::size_t m_dstPitchBytes; - std::size_t m_srcPitchBytes; + std::size_t m_dstRowPitchBytes; + std::size_t m_srcRowPitchBytes; void* m_dstMemNative; void const* m_srcMemNative; @@ -308,12 +308,10 @@ namespace alpaka , m_dstDepth(static_cast(getDepth(viewDst))) , m_srcDepth(static_cast(getDepth(viewSrc))) # endif - , m_dstPitchBytesX(static_cast(getPitchesInBytes(viewDst)[Dim::value - 1u])) - , m_srcPitchBytesX(static_cast(getPitchesInBytes(viewSrc)[Dim::value - 1u])) - , m_dstPitchBytesY(static_cast( - getPitchesInBytes(viewDst)[Dim::value - (2u % Dim::value)])) - , m_srcPitchBytesY(static_cast( - getPitchesInBytes(viewSrc)[Dim::value - (2u % Dim::value)])) + , m_dstRowPitchBytes(static_cast(getPitchesInBytes(viewDst)[1])) + , m_srcRowPitchBytes(static_cast(getPitchesInBytes(viewSrc)[1])) + , m_dstSlicePitchBytes(static_cast(getPitchesInBytes(viewDst)[0])) + , m_srcSlicePitchBytes(static_cast(getPitchesInBytes(viewSrc)[0])) , m_dstMemNative(reinterpret_cast(getPtrNative(viewDst))) , 
m_srcMemNative(reinterpret_cast(getPtrNative(viewSrc))) { @@ -324,8 +322,8 @@ namespace alpaka ALPAKA_ASSERT(m_extentWidth <= m_srcWidth); ALPAKA_ASSERT(m_extentHeight <= m_srcHeight); ALPAKA_ASSERT(m_extentDepth <= m_srcDepth); - ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes); - ALPAKA_ASSERT(m_extentWidthBytes <= m_srcPitchBytes); + ALPAKA_ASSERT(m_extentWidthBytes <= m_dstRowPitchBytes); + ALPAKA_ASSERT(m_extentWidthBytes <= m_srcRowPitchBytes); # endif } @@ -359,21 +357,17 @@ namespace alpaka ALPAKA_DEBUG_FULL_LOG_SCOPE; // Fill CUDA/HIP parameter structure. - typename TApi::Memcpy3DParms_t memCpy3DParms; - memCpy3DParms.srcArray = nullptr; // Either srcArray or srcPtr. - memCpy3DParms.srcPos = TApi::makePos(0, 0, 0); // Optional. Offset in bytes. + typename TApi::Memcpy3DParms_t memCpy3DParms{}; // zero-init required per CUDA documentation memCpy3DParms.srcPtr = TApi::makePitchedPtr( const_cast(m_srcMemNative), - m_srcPitchBytesX, + m_srcRowPitchBytes, static_cast(m_srcWidth), - m_srcPitchBytesY / m_srcPitchBytesX); - memCpy3DParms.dstArray = nullptr; // Either dstArray or dstPtr. - memCpy3DParms.dstPos = TApi::makePos(0, 0, 0); // Optional. Offset in bytes. 
+ m_srcSlicePitchBytes / m_srcRowPitchBytes); memCpy3DParms.dstPtr = TApi::makePitchedPtr( m_dstMemNative, - m_dstPitchBytesX, + m_dstRowPitchBytes, static_cast(m_dstWidth), - m_dstPitchBytesY / m_dstPitchBytesX); + m_dstSlicePitchBytes / m_dstRowPitchBytes); memCpy3DParms.extent = TApi::makeExtent( m_extentWidthBytes, static_cast(m_extentHeight), @@ -388,10 +382,11 @@ namespace alpaka std::cout << __func__ << " ew: " << m_extentWidth << " eh: " << m_extentHeight << " ed: " << m_extentDepth << " ewb: " << m_extentWidthBytes << " ddev: " << m_iDstDevice << " dw: " << m_dstWidth << " dh: " << m_dstHeight << " dd: " << m_dstDepth - << " dptr: " << m_dstMemNative << " dpitchb: " << m_dstPitchBytes - << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sh: " << m_srcHeight - << " sd: " << m_srcDepth << " sptr: " << m_srcMemNative << " spitchb: " << m_srcPitchBytes - << std::endl; + << " dptr: " << m_dstMemNative << " drowpitch: " << m_dstRowPitchBytes + << " dslicepitch: " << m_dstSlicePitchBytes << " sdev: " << m_iSrcDevice + << " sw: " << m_srcWidth << " sh: " << m_srcHeight << " sd: " << m_srcDepth + << " sptr: " << m_srcMemNative << " srowpitch: " << m_srcRowPitchBytes + << " sslicepitch: " << m_srcSlicePitchBytes << std::endl; } # endif typename TApi::MemcpyKind_t m_uniformMemCpyKind; @@ -411,10 +406,10 @@ namespace alpaka Idx m_dstDepth; Idx m_srcDepth; # endif - std::size_t m_dstPitchBytesX; - std::size_t m_srcPitchBytesX; - std::size_t m_dstPitchBytesY; - std::size_t m_srcPitchBytesY; + std::size_t m_dstRowPitchBytes; + std::size_t m_srcRowPitchBytes; + std::size_t m_dstSlicePitchBytes; + std::size_t m_srcSlicePitchBytes; void* m_dstMemNative; void const* m_srcMemNative; diff --git a/include/alpaka/mem/buf/uniformCudaHip/Set.hpp b/include/alpaka/mem/buf/uniformCudaHip/Set.hpp index b354eaf000d2..3b6551c44614 100644 --- a/include/alpaka/mem/buf/uniformCudaHip/Set.hpp +++ b/include/alpaka/mem/buf/uniformCudaHip/Set.hpp @@ -149,7 +149,7 @@ namespace alpaka 
auto const dstWidth = getWidth(view); auto const dstHeight = getHeight(view); # endif - auto const dstPitchBytes = static_cast(getPitchesInBytes(view)[Dim::value - 1u]); + auto const dstRowPitchBytes = static_cast(getPitchesInBytes(view)[0]); auto const dstNativePtr = reinterpret_cast(getPtrNative(view)); ALPAKA_ASSERT(extentWidth <= dstWidth); ALPAKA_ASSERT(extentHeight <= dstHeight); @@ -157,7 +157,7 @@ namespace alpaka // Initiate the memory set. ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memset2DAsync( dstNativePtr, - dstPitchBytes, + dstRowPitchBytes, static_cast(this->m_byte), extentWidthBytes, static_cast(extentHeight), @@ -202,9 +202,7 @@ namespace alpaka auto const dstHeight = getHeight(view); auto const dstDepth = getDepth(view); # endif - auto const dstPitchBytesX = static_cast(getPitchesInBytes(view)[Dim::value - 1u]); - auto const dstPitchBytesY - = static_cast(getPitchesInBytes(view)[Dim::value - (2u % Dim::value)]); + auto const [dstSlicePitchBytes, dstRowPitchBytes, _] = getPitchesInBytes(view); auto const dstNativePtr = reinterpret_cast(getPtrNative(view)); ALPAKA_ASSERT(extentWidth <= dstWidth); ALPAKA_ASSERT(extentHeight <= dstHeight); @@ -213,9 +211,9 @@ namespace alpaka // Fill CUDA parameter structures. 
typename TApi::PitchedPtr_t const pitchedPtrVal = TApi::makePitchedPtr( dstNativePtr, - dstPitchBytesX, + static_cast(dstRowPitchBytes), static_cast(dstWidth) * sizeof(Elem), - dstPitchBytesY / dstPitchBytesX); + static_cast(dstSlicePitchBytes / dstRowPitchBytes)); typename TApi::Extent_t const extentVal = TApi::makeExtent( static_cast(extentWidth) * sizeof(Elem), diff --git a/include/alpaka/mem/view/Traits.hpp b/include/alpaka/mem/view/Traits.hpp index 47ee11785858..091799e832c3 100644 --- a/include/alpaka/mem/view/Traits.hpp +++ b/include/alpaka/mem/view/Traits.hpp @@ -14,6 +14,7 @@ #include "alpaka/meta/Integral.hpp" #include "alpaka/offset/Traits.hpp" #include "alpaka/queue/Traits.hpp" +#include "alpaka/vec/Traits.hpp" #include "alpaka/vec/Vec.hpp" #include @@ -34,14 +35,12 @@ namespace alpaka ALPAKA_FN_HOST_ACC constexpr inline auto calculatePitchesFromExtents(Vec const& extent) { Vec pitchBytes{}; - if constexpr(TDim::value > 0) - { - pitchBytes[TDim::value - 1u] = extent[TDim::value - 1u] * static_cast(sizeof(TElem)); - for(TIdx i = TDim::value - 1u; i > static_cast(0u); --i) - { - pitchBytes[i - 1] = extent[i - 1] * pitchBytes[i]; - } - } + constexpr auto dim = TIdx{TDim::value}; + if constexpr(dim > 0) + pitchBytes.back() = static_cast(sizeof(TElem)); + if constexpr(dim > 1) + for(TIdx i = TDim::value - 1; i > 0; i--) + pitchBytes[i - 1] = extent[i] * pitchBytes[i]; return pitchBytes; } } // namespace detail @@ -63,17 +62,17 @@ namespace alpaka //! //! The default implementation uses the extent to calculate the pitch. 
template - struct [[deprecated]] GetPitchBytes + struct [[deprecated("Use GetPitchesInBytes instead")]] GetPitchBytes { using ViewIdx = Idx; - ALPAKA_FN_HOST static auto getPitchBytes(TView const& view) -> ViewIdx + ALPAKA_FN_HOST static auto getPitchBytes(TView const& view)->ViewIdx { return getPitchBytesDefault(view); } private: - static auto getPitchBytesDefault(TView const& view) -> ViewIdx + static auto getPitchBytesDefault(TView const& view)->ViewIdx { constexpr auto idx = TIdx::value; constexpr auto viewDim = Dim::value; @@ -96,10 +95,7 @@ namespace alpaka } }; - //! The pitches in bytes. - //! This is the distance in bytes in the linear memory between two consecutive elements in the next higher - //! dimension (TIdx-1). - //! + //! Customization point for \ref getPitchesInBytes. //! The default implementation uses the extent to calculate the pitches. template struct GetPitchesInBytes @@ -180,7 +176,7 @@ namespace alpaka //! \return The pitch in bytes. This is the distance in bytes between two consecutive elements in the given //! dimension. template - [[deprecated]] ALPAKA_FN_HOST auto getPitchBytes(TView const& view) -> Idx + [[deprecated("Use getPitchesInBytes instead")]] ALPAKA_FN_HOST auto getPitchBytes(TView const& view) -> Idx { #if BOOST_COMP_CLANG || BOOST_COMP_GNUC # pragma GCC diagnostic push @@ -192,8 +188,13 @@ namespace alpaka #endif } - //! \return The pitches in bytes. This is the distance in bytes between two consecutive elements in the given - //! dimension. + //! \return The pitches in bytes as an alpaka::Vec. This is the distance in bytes between two consecutive elements + //! in the given dimension. + //! E.g. for a 3D view without padding, the 0-dim pitch is the distance in bytes to jump from one element to the + //! next within the same row, the 1-dim pitch (aka. the row pitch) is the distance in bytes to jump from one + //! element to the neighboring element on the next row. The 2-dim pitch (aka. 
the slice pitch) is the distance in + //! bytes to jump from one element to the neighboring element on the next slice. + //! E.g. a 3D view of floats without padding and the extents {42, 10, 2}, would have a pitch vector of {80, 8, 4}. template ALPAKA_FN_HOST auto getPitchesInBytes(TView const& view) -> Vec, Idx> { @@ -419,13 +420,7 @@ namespace alpaka template ALPAKA_FN_HOST auto getPitchBytesVecEnd(TView const& view = TView()) -> Vec> { - static_assert(TDim::value <= Dim::value, "Cannot get more items than the pitch vector holds"); - - auto const p = getPitchesInBytes(view); - Vec> v; - for(unsigned i = 0; i < TDim::value; i++) - v[i] = p[(Dim::value - TDim::value) + i]; - return v; + return subVecEnd(getPitchesInBytes(view)); } //! \return A view to static device memory. @@ -567,15 +562,6 @@ namespace alpaka auto const ex = getExtents(view); return std::experimental::dextents, Dim::value>{ex[Is]...}; } - - template - ALPAKA_FN_HOST auto makeStrides(Vec const& pitches, std::index_sequence) - { - // alpaka pitches are right-shifted by 1. We skip getPitchBytes<0> (the size in bytes of the entire - // buffer) and append the element size last - return std::array{ - (Is < TDim::value - 1 ? pitches[Is + 1] : static_cast(sizeof(TElem)))...}; - } } // namespace detail //! Customization point for getting an mdspan from a view. 
@@ -588,8 +574,7 @@ namespace alpaka using Element = Elem; auto extents = detail::makeExtents(view, std::make_index_sequence{}); auto* ptr = reinterpret_cast(getPtrNative(view)); - auto const strides - = detail::makeStrides(getPitchesInBytes(view), std::make_index_sequence{}); + auto const strides = toArray(getPitchesInBytes(view)); layout_stride::mapping m{extents, strides}; return mdspan>{ ptr, @@ -602,8 +587,7 @@ namespace alpaka using Element = Elem; auto extents = detail::makeExtents(view, std::make_index_sequence{}); auto* ptr = reinterpret_cast(getPtrNative(view)); - auto strides - = detail::makeStrides(getPitchesInBytes(view), std::make_index_sequence{}); + auto strides = toArray(getPitchesInBytes(view)); std::reverse(begin(strides), end(strides)); layout_stride::mapping m{extents, strides}; return mdspan>{ diff --git a/include/alpaka/mem/view/ViewAccessOps.hpp b/include/alpaka/mem/view/ViewAccessOps.hpp index 00e25bc4c9c8..31b7a16e0dca 100644 --- a/include/alpaka/mem/view/ViewAccessOps.hpp +++ b/include/alpaka/mem/view/ViewAccessOps.hpp @@ -98,15 +98,7 @@ namespace alpaka::internal auto ptr = reinterpret_cast(data()); if constexpr(Dim::value > 0) - { - auto const pitchesInBytes = getPitchesInBytes(*static_cast(this)); - for(std::size_t i = 0u; i < Dim::value; i++) - { - Idx const pitch - = i + 1 < Dim::value ? pitchesInBytes[i + 1] : static_cast(sizeof(value_type)); - ptr += static_cast(index[i] * pitch); - } - } + ptr += (getPitchesInBytes(*static_cast(this)) * castVec(index)).sum(); return reinterpret_cast(ptr); } diff --git a/include/alpaka/mem/view/ViewSubView.hpp b/include/alpaka/mem/view/ViewSubView.hpp index ef784cf58622..514d76143246 100644 --- a/include/alpaka/mem/view/ViewSubView.hpp +++ b/include/alpaka/mem/view/ViewSubView.hpp @@ -34,62 +34,26 @@ namespace alpaka //! \param view The view this view is a sub-view of. //! \param extentElements The extent in elements. //! \param relativeOffsetsElements The offsets in elements. 
- template + template ViewSubView( - TView const& view, + TQualifiedView& view, TExtent const& extentElements, TOffsets const& relativeOffsetsElements = TOffsets()) : m_viewParentView(getPtrNative(view), getDev(view), getExtents(view), getPitchesInBytes(view)) , m_extentElements(getExtents(extentElements)) - , m_offsetsElements(getOffsetVec(relativeOffsetsElements)) + , m_offsetsElements(getOffsets(relativeOffsetsElements)) + , m_nativePtr(computeNativePtr()) { ALPAKA_DEBUG_FULL_LOG_SCOPE; - static_assert( - std::is_same_v>, - "The dev type of TView and the Dev template parameter have to be identical!"); + using View = std::remove_cv_t; static_assert( - std::is_same_v>, - "The idx type of TView and the TIdx template parameter have to be identical!"); - static_assert( - std::is_same_v>, - "The idx type of TExtent and the TIdx template parameter have to be identical!"); - static_assert( - std::is_same_v>, - "The idx type of TOffsets and the TIdx template parameter have to be identical!"); - - static_assert( - std::is_same_v>, - "The dim type of TView and the TDim template parameter have to be identical!"); - static_assert( - std::is_same_v>, - "The dim type of TExtent and the TDim template parameter have to be identical!"); - static_assert( - std::is_same_v>, - "The dim type of TOffsets and the TDim template parameter have to be identical!"); - - ALPAKA_ASSERT( - ((m_offsetsElements + m_extentElements) <= getExtents(view)).foldrAll(std::logical_and(), true)); - } - //! Constructor. - //! \param view The view this view is a sub-view of. - //! \param extentElements The extent in elements. - //! \param relativeOffsetsElements The offsets in elements. 
- template - ViewSubView(TView& view, TExtent const& extentElements, TOffsets const& relativeOffsetsElements = TOffsets()) - : m_viewParentView(getPtrNative(view), getDev(view), getExtents(view), getPitchesInBytes(view)) - , m_extentElements(getExtents(extentElements)) - , m_offsetsElements(getOffsetVec(relativeOffsetsElements)) - { - ALPAKA_DEBUG_FULL_LOG_SCOPE; - - static_assert( - std::is_same_v>, + std::is_same_v>, "The dev type of TView and the Dev template parameter have to be identical!"); static_assert( - std::is_same_v>, + std::is_same_v>, "The idx type of TView and the TIdx template parameter have to be identical!"); static_assert( std::is_same_v>, @@ -99,7 +63,7 @@ namespace alpaka "The idx type of TOffsets and the TIdx template parameter have to be identical!"); static_assert( - std::is_same_v>, + std::is_same_v>, "The dim type of TView and the TDim template parameter have to be identical!"); static_assert( std::is_same_v>, @@ -108,28 +72,43 @@ namespace alpaka std::is_same_v>, "The dim type of TOffsets and the TDim template parameter have to be identical!"); - ALPAKA_ASSERT( - ((m_offsetsElements + m_extentElements) <= getExtents(view)).foldrAll(std::logical_and(), true)); + ALPAKA_ASSERT(((m_offsetsElements + m_extentElements) <= getExtents(view)).all()); } //! \param view The view this view is a sub-view of. template - explicit ViewSubView(TView const& view) : ViewSubView(view, view, Vec::all(0)) + explicit ViewSubView(TView const& view) : ViewSubView(view, getExtents(view), Vec::zeros()) { ALPAKA_DEBUG_FULL_LOG_SCOPE; } //! \param view The view this view is a sub-view of. 
template - explicit ViewSubView(TView& view) : ViewSubView(view, view, Vec::all(0)) + explicit ViewSubView(TView& view) : ViewSubView(view, getExtents(view), Vec::zeros()) { ALPAKA_DEBUG_FULL_LOG_SCOPE; } public: + ALPAKA_FN_HOST auto computeNativePtr() + { +#if BOOST_COMP_GNUC +# pragma GCC diagnostic push + // "cast from 'std::uint8_t*' to 'TElem*' increases required alignment of target type" +# pragma GCC diagnostic ignored "-Wcast-align" +#endif + return reinterpret_cast( + reinterpret_cast(alpaka::getPtrNative(m_viewParentView)) + + (m_offsetsElements * getPitchesInBytes(m_viewParentView)).sum()); +#if BOOST_COMP_GNUC +# pragma GCC diagnostic pop +#endif + } + ViewPlainPtr m_viewParentView; // This wraps the parent view. Vec m_extentElements; // The extent of this view. Vec m_offsetsElements; // The offset relative to the parent view. + TElem* m_nativePtr; }; // Trait specializations for ViewSubView. @@ -176,57 +155,19 @@ namespace alpaka } }; -#if BOOST_COMP_GNUC -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored \ - "-Wcast-align" // "cast from 'std::uint8_t*' to 'TElem*' increases required alignment of target type" -#endif //! The ViewSubView native pointer get trait specialization. template struct GetPtrNative> { - private: - using IdxSequence = std::make_index_sequence; - - public: ALPAKA_FN_HOST static auto getPtrNative(ViewSubView const& view) -> TElem const* { - // \TODO: pre-calculate this pointer for faster execution. - return reinterpret_cast( - reinterpret_cast(alpaka::getPtrNative(view.m_viewParentView)) - + pitchedOffsetBytes(view, IdxSequence())); + return view.m_nativePtr; } ALPAKA_FN_HOST static auto getPtrNative(ViewSubView& view) -> TElem* { - // \TODO: pre-calculate this pointer for faster execution. - return reinterpret_cast( - reinterpret_cast(alpaka::getPtrNative(view.m_viewParentView)) - + pitchedOffsetBytes(view, IdxSequence())); - } - - private: - //! For a 3D vector this calculates: - //! - //! 
getOffsets(view)[0] * getPitchBytes<1u>(view) - //! + getOffsets(view)[1] * getPitchBytes<2u>(view) - //! + getOffsets(view)[2] * getPitchBytes<3u>(view) - //! while getPitchBytes<3u>(view) is equivalent to sizeof(TElem) - template - ALPAKA_FN_HOST static auto pitchedOffsetBytes(TView const& view, std::index_sequence const&) - -> TIdx - { - auto const offsets = getOffsets(view); - auto const pitches = getPitchesInBytes(view); - return ( - (offsets[TIndices] - * (TIndices + 1 < Dim::value ? pitches[TIndices + 1] - : static_cast(sizeof(Elem)))) - + ... + TIdx{0}); // FIXME: see comment above + return view.m_nativePtr; } }; -#if BOOST_COMP_GNUC -# pragma GCC diagnostic pop -#endif //! The ViewSubView pitch get trait specialization. template @@ -234,7 +175,7 @@ namespace alpaka { ALPAKA_FN_HOST auto operator()(ViewSubView const& view) const { - return alpaka::getPitchesInBytes(view.m_viewParentView); + return getPitchesInBytes(view.m_viewParentView); } }; diff --git a/include/alpaka/test/mem/view/Iterator.hpp b/include/alpaka/test/mem/view/Iterator.hpp index 42295ff0de94..314d1c0c9d53 100644 --- a/include/alpaka/test/mem/view/Iterator.hpp +++ b/include/alpaka/test/mem/view/Iterator.hpp @@ -17,11 +17,6 @@ namespace alpaka::test template using MimicConst = std::conditional_t, std::add_const_t, std::remove_const_t>; -#if BOOST_COMP_GNUC -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored \ - "-Wcast-align" // "cast from 'Byte*' to 'Elem*' increases required alignment of target type" -#endif template class IteratorView { @@ -39,7 +34,7 @@ namespace alpaka::test { } - ALPAKA_FN_HOST IteratorView(TView& view) : IteratorView(view, 0) + ALPAKA_FN_HOST explicit IteratorView(TView& view) : IteratorView(view, 0) { } @@ -87,37 +82,19 @@ namespace alpaka::test return *m_nativePtr; else { - using Dim1 = DimInt<1>; - using DimMin1 = DimInt; - - Vec const currentIdxDim1{m_currentIdx}; - Vec const currentIdxDimx(mapIdx(currentIdxDim1, m_extents)); - - // [pz, py, px] -> [py, 
px] - auto const pitchWithoutOutermost = subVecEnd(m_pitchBytes); - // [ElemSize] - Vec const elementSizeVec = static_cast(sizeof(Elem)); - // [py, px] ++ [ElemSize] -> [py, px, ElemSize] - Vec const dstPitchBytes = concatVec(pitchWithoutOutermost, elementSizeVec); - // [py, px, ElemSize] [z, y, x] -> [py*z, px*y, ElemSize*x] - auto const dimensionalOffsetsInByte = currentIdxDimx * dstPitchBytes; - // sum{[py*z, px*y, ElemSize*x]} -> offset in byte - auto const offsetInByte = dimensionalOffsetsInByte.foldrAll(std::plus()); - - using Byte = MimicConst; - Byte* ptr(reinterpret_cast(m_nativePtr) + offsetInByte); - -#if 0 - std::cout - << " i1: " << currentIdxDim1 - << " in: " << currentIdxDimx - << " dpb: " << dstPitchBytes - << " offb: " << offsetInByte - << " ptr: " << reinterpret_cast(ptr) - << " v: " << *reinterpret_cast(ptr) - << std::endl; + Vec const currentIdxDimx + = mapIdx(Vec, Idx>{m_currentIdx}, m_extents); + auto const offsetInBytes = (currentIdxDimx * m_pitchBytes).sum(); + using QualifiedByte = MimicConst; +#if BOOST_COMP_GNUC +# pragma GCC diagnostic push + // "cast from 'Byte*' to 'Elem*' increases required alignment of target type" +# pragma GCC diagnostic ignored "-Wcast-align" +#endif + return *reinterpret_cast(reinterpret_cast(m_nativePtr) + offsetInBytes); +#if BOOST_COMP_GNUC +# pragma GCC diagnostic pop #endif - return *reinterpret_cast(ptr); } ALPAKA_UNREACHABLE(*m_nativePtr); } @@ -128,9 +105,6 @@ namespace alpaka::test Vec m_extents; Vec m_pitchBytes; }; -#if BOOST_COMP_GNUC -# pragma GCC diagnostic pop -#endif template struct Begin diff --git a/include/alpaka/test/mem/view/ViewTest.hpp b/include/alpaka/test/mem/view/ViewTest.hpp index 59f30411d685..82ab915f66cf 100644 --- a/include/alpaka/test/mem/view/ViewTest.hpp +++ b/include/alpaka/test/mem/view/ViewTest.hpp @@ -56,16 +56,7 @@ namespace alpaka::test // trait::GetPitchBytes { - // The pitches have to be at least as large as the values we calculate here. 
- auto pitchMinimum = Vec, TIdx>::ones(); - // Initialize the pitch between two elements of the X dimension ... - pitchMinimum[TDim::value] = sizeof(TElem); - // ... and fill all the other dimensions. - for(TIdx i = TDim::value; i > static_cast(0u); --i) - { - pitchMinimum[i - 1] = extent[i - 1] * pitchMinimum[i]; - } - + auto const pitchMinimum = alpaka::detail::calculatePitchesFromExtents(extent); auto const pitchView = getPitchesInBytes(view); for(TIdx i = TDim::value; i > static_cast(0u); --i) diff --git a/test/integ/mandelbrot/src/mandelbrot.cpp b/test/integ/mandelbrot/src/mandelbrot.cpp index a9812fbcf367..9cfeac4bdacf 100644 --- a/test/integ/mandelbrot/src/mandelbrot.cpp +++ b/test/integ/mandelbrot/src/mandelbrot.cpp @@ -14,7 +14,7 @@ #include #include -//#define ALPAKA_MANDELBROT_TEST_CONTINOUS_COLOR_MAPPING // Define this to enable the continuous color mapping. +// #define ALPAKA_MANDELBROT_TEST_CONTINOUS_COLOR_MAPPING // Define this to enable the continuous color mapping. //! Complex Number. template @@ -87,7 +87,7 @@ class MandelbrotKernel //! \param pColors The output image. //! \param numRows The number of rows in the image //! \param numCols The number of columns in the image. - //! \param pitchBytes The pitch in bytes. + //! \param rowPitchBytes The row pitch in bytes. //! \param fMinR The left border. //! \param fMaxR The right border. //! \param fMinI The bottom border. 
@@ -100,7 +100,7 @@ class MandelbrotKernel std::uint32_t* const pColors, std::uint32_t const& numRows, std::uint32_t const& numCols, - std::uint32_t const& pitchBytes, + std::uint32_t const& rowPitchBytes, float const& fMinR, float const& fMaxR, float const& fMinI, @@ -109,10 +109,7 @@ class MandelbrotKernel { static_assert(alpaka::Dim::value == 2, "The MandelbrotKernel expects 2-dimensional indices!"); - auto const gridThreadIdx = alpaka::getIdx(acc); - auto const& gridThreadIdxX = gridThreadIdx[1u]; - auto const& gridThreadIdxY = gridThreadIdx[0u]; - + auto const [gridThreadIdxY, gridThreadIdxX] = alpaka::getIdx(acc); if((gridThreadIdxY < numRows) && (gridThreadIdxX < numCols)) { SimpleComplex c( @@ -121,7 +118,7 @@ class MandelbrotKernel auto const iterationCount = iterateMandelbrot(c, maxIterations); - auto const pColorsRow = pColors + ((gridThreadIdxY * pitchBytes) / sizeof(std::uint32_t)); + auto const pColorsRow = pColors + ((gridThreadIdxY * rowPitchBytes) / sizeof(std::uint32_t)); pColorsRow[gridThreadIdxX] = #ifdef ALPAKA_MANDELBROT_TEST_CONTINOUS_COLOR_MAPPING iterationCountToContinousColor(iterationCount, maxIterations); @@ -203,8 +200,8 @@ auto writeTgaColorImage(std::string const& fileName, TBuf const& bufRgba) -> voi ALPAKA_ASSERT(bufWidthColors >= 1); auto const bufHeightColors = alpaka::getHeight(bufRgba); ALPAKA_ASSERT(bufHeightColors >= 1); - auto const bufPitchBytes = alpaka::getPitchesInBytes(bufRgba)[alpaka::Dim::value - 1u]; - ALPAKA_ASSERT(bufPitchBytes >= bufWidthBytes); + auto const bufRowPitchBytes = alpaka::getPitchesInBytes(bufRgba)[0]; + ALPAKA_ASSERT(bufRowPitchBytes >= bufWidthBytes); std::ofstream ofs(fileName, std::ofstream::out | std::ofstream::binary); if(!ofs.is_open()) @@ -235,7 +232,7 @@ auto writeTgaColorImage(std::string const& fileName, TBuf const& bufRgba) -> voi // Write the data. char const* pData(reinterpret_cast(alpaka::getPtrNative(bufRgba))); // If there is no padding, we can directly write the whole buffer data ... 
- if(bufPitchBytes == bufWidthBytes) + if(bufRowPitchBytes == bufWidthBytes) { ofs.write(pData, static_cast(bufWidthBytes * bufHeightColors)); } @@ -244,7 +241,7 @@ auto writeTgaColorImage(std::string const& fileName, TBuf const& bufRgba) -> voi { for(auto row = decltype(bufHeightColors)(0); row < bufHeightColors; ++row) { - ofs.write(pData + bufPitchBytes * row, static_cast(bufWidthBytes)); + ofs.write(pData + bufRowPitchBytes * row, static_cast(bufWidthBytes)); } } } @@ -315,13 +312,15 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs) alpaka::memcpy(queue, bufColorAcc, bufColorHost); // Create the kernel execution task. + auto const [rowPitch, _] = alpaka::getPitchesInBytes(bufColorAcc); + CHECK(rowPitch % sizeof(Val) == 0); auto const taskKernel = alpaka::createTaskKernel( workDiv, kernel, alpaka::getPtrNative(bufColorAcc), numRows, numCols, - alpaka::getPitchesInBytes(bufColorAcc)[1], + rowPitch, fMinR, fMaxR, fMinI, diff --git a/test/integ/matMul/src/matMul.cpp b/test/integ/matMul/src/matMul.cpp index b6d8c2dfb695..bf45822e9b69 100644 --- a/test/integ/matMul/src/matMul.cpp +++ b/test/integ/matMul/src/matMul.cpp @@ -83,7 +83,7 @@ class MatMulKernel for(TIndex k2(0u); k2 < blockMulCount; ++k2) { // Copy the current blocks of A and B into shared memory in parallel. - // If the element of the current thread is outside of the matrix, zero is written into the shared memory. + // If the element of the current thread is outside the matrix, zero is written into the shared memory. // This is possible because zero is a result neutral extension of the matrices regarding the dot product. 
auto const AIdxX = k2 * blockThreadExtentX + blockThreadIdxX; auto const AIdx1d = gridThreadIdxY * lda + AIdxX; @@ -240,14 +240,22 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs) << alpaka::test::integ::measureRunTimeMs([&] { alpaka::memcpy(queueAcc, bufCAcc, bufCHost); }) << " ms" << std::endl; - auto const pitchA = alpaka::getPitchesInBytes(bufAAcc)[1]; - auto const pitchB = alpaka::getPitchesInBytes(bufBAcc)[1]; - auto const pitchC = alpaka::getPitchesInBytes(bufCAcc)[1]; + auto const rowPitchA = alpaka::getPitchesInBytes(bufAAcc)[0]; + auto const rowPitchB = alpaka::getPitchesInBytes(bufBAcc)[0]; + auto const rowPitchC = alpaka::getPitchesInBytes(bufCAcc)[0]; - // Assumptions we make - REQUIRE(pitchA % sizeof(Val) == 0); - REQUIRE(pitchB % sizeof(Val) == 0); - REQUIRE(pitchC % sizeof(Val) == 0); + // We assume that the row pitches are divisible by the element size + REQUIRE(rowPitchA % sizeof(Val) == 0); + REQUIRE(rowPitchB % sizeof(Val) == 0); + REQUIRE(rowPitchC % sizeof(Val) == 0); + + auto const lda = static_cast(rowPitchA / sizeof(Val)); + auto const ldb = static_cast(rowPitchB / sizeof(Val)); + auto const ldc = static_cast(rowPitchC / sizeof(Val)); + + std::cout << "pitchesA " << alpaka::getPitchesInBytes(bufAAcc) << " lda: " << lda << "\n"; + std::cout << "pitchesB " << alpaka::getPitchesInBytes(bufBAcc) << " ldb: " << ldb << "\n"; + std::cout << "pitchesC " << alpaka::getPitchesInBytes(bufCAcc) << " ldc: " << ldc << "\n"; // Create the kernel execution task. auto const taskKernel = alpaka::createTaskKernel( @@ -258,12 +266,12 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs) k, static_cast(1), alpaka::getPtrNative(bufAAcc), - static_cast(pitchA / sizeof(Val)), + lda, alpaka::getPtrNative(bufBAcc), - static_cast(pitchB / sizeof(Val)), + ldb, static_cast(1), alpaka::getPtrNative(bufCAcc), - static_cast(pitchC / sizeof(Val))); + ldc); // Profile the kernel execution. 
std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queueAcc, taskKernel) << " ms" diff --git a/test/unit/mem/buf/src/BufTest.cpp b/test/unit/mem/buf/src/BufTest.cpp index 816dee6b3235..ee563dcd29d3 100644 --- a/test/unit/mem/buf/src/BufTest.cpp +++ b/test/unit/mem/buf/src/BufTest.cpp @@ -212,32 +212,16 @@ static auto testBufferAccessorAdaptor( auto const platformAcc = alpaka::Platform{}; auto const dev = alpaka::getDevByIdx(platformAcc, 0); - // alpaka::malloc auto buf = alpaka::allocBuf(dev, extent); // check that the array subscript operator access the correct element auto const& pitch = alpaka::getPitchesInBytes(buf); INFO("buffer extent: " << extent << " elements"); INFO("buffer pitch: " << pitch << " bytes"); - CHECK((index < extent).foldrAll(std::logical_and(), true)); - - auto base = reinterpret_cast(std::data(buf)); - uintptr_t expected = base; - if constexpr(Dim::value > 1) - { - expected += static_cast(pitch[1] * index[0]); - } - if constexpr(Dim::value > 2) - { - expected += static_cast(pitch[2] * index[1]); - } - if constexpr(Dim::value > 3) - { - expected += static_cast(pitch[3] * index[2]); - } - if constexpr(Dim::value > 0) - expected += sizeof(Elem) * static_cast(index[Dim::value - 1]); + CHECK((index < extent).all()); + auto const base = reinterpret_cast(std::data(buf)); + auto const expected = base + static_cast((pitch * index).sum()); INFO("element " << index << " expected at offset " << expected - base); INFO("element " << index << " returned at offset " << reinterpret_cast(&buf[index]) - base); CHECK(reinterpret_cast(expected) == &buf[index]); diff --git a/test/unit/mem/view/src/ViewSubViewTest.cpp b/test/unit/mem/view/src/ViewSubViewTest.cpp index 22afbb1f98af..02e7267b529a 100644 --- a/test/unit/mem/view/src/ViewSubViewTest.cpp +++ b/test/unit/mem/view/src/ViewSubViewTest.cpp @@ -36,14 +36,10 @@ namespace alpaka::test // alpaka::trait::GetPitchesInBytes // The pitch of the view has to be identical to the pitch of the 
underlying buffer in all dimensions. + auto const pitchBuf = alpaka::getPitchesInBytes(buf); { - auto const pitchBuf = alpaka::getPitchesInBytes(buf); auto const pitchView = alpaka::getPitchesInBytes(view); - - for(TIdx i = TDim::value; i > static_cast(0u); --i) - { - REQUIRE(pitchBuf[i - static_cast(1u)] == pitchView[i - static_cast(1u)]); - } + CHECK(pitchBuf == pitchView); } // alpaka::trait::GetPtrNative @@ -51,15 +47,7 @@ namespace alpaka::test { auto viewPtrNative = reinterpret_cast(alpaka::getPtrNative(buf)); if constexpr(TDim::value > 0) - { - auto const pitchBuf = alpaka::getPitchesInBytes(buf); - for(TIdx i = TDim::value; i > static_cast(0u); --i) - { - auto const pitch - = (i < static_cast(TDim::value)) ? pitchBuf[i] : static_cast(sizeof(TElem)); - viewPtrNative += offsetView[i - static_cast(1u)] * pitch; - } - } + viewPtrNative += (offsetView * pitchBuf).sum(); REQUIRE(reinterpret_cast(viewPtrNative) == alpaka::getPtrNative(view)); } } @@ -177,10 +165,10 @@ TEST_CASE("viewSubViewExample", "[memView]") auto checkBufContent = [&](std::vector> const& data) { for(std::size_t row = 0; row < 4; row++) - for(std::size_t j = 0; j < 5; j++) + for(std::size_t col = 0; col < 5; col++) { - CAPTURE(row, j); - CHECK(buf[Vec{static_cast(row), static_cast(j)}] == data[row][j]); + CAPTURE(row, col); + CHECK(buf[Vec{static_cast(row), static_cast(col)}] == data[row][col]); } }; @@ -230,3 +218,10 @@ TEST_CASE("viewSubViewExample", "[memView]") checkBufContent({{1, 1, 3, 1, 1}, {1, 5, 4, 4, 1}, {2, 5, 6, 6, 2}, {1, 1, 1, 1, 1}}); } } + +TEST_CASE("calculatePitchesFromExtents", "[memView]") +{ + CHECK((alpaka::detail::calculatePitchesFromExtents(alpaka::Vec{1, 1, 1}) == alpaka::Vec{4, 4, 4})); + CHECK((alpaka::detail::calculatePitchesFromExtents(alpaka::Vec{2, 2, 2}) == alpaka::Vec{16, 8, 4})); + CHECK((alpaka::detail::calculatePitchesFromExtents(alpaka::Vec{42, 10, 2}) == alpaka::Vec{80, 8, 4})); +}