From 14caad106cfedd7989f9410cb7c5ed6b9dde69ee Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Tue, 22 Aug 2023 11:59:43 +0200 Subject: [PATCH] Redefine pitches This PR shifts the values returned from getPitchesInBytes to be consistent with the strides of std::mdspan (except that the values are expressed in bytes instead of elements). Example: the pitch vector of a float buffer with the extent {42, 10, 2} changes: Before: {3360, 80, 8} After: {80, 8, 4} The new meaning is that the pitch value is the number of bytes to jump from one element to the next in the given dimension. Fixes: #2083 --- example/bufferCopy/src/bufferCopy.cpp | 8 +-- example/randomCells2D/src/randomCells2D.cpp | 12 ++-- include/alpaka/idx/MapIdx.hpp | 15 ++-- include/alpaka/mem/buf/cpu/Copy.hpp | 19 ++--- include/alpaka/mem/buf/cpu/Set.hpp | 12 ++-- .../alpaka/mem/buf/uniformCudaHip/Copy.hpp | 57 ++++++++------- include/alpaka/mem/view/Traits.hpp | 32 ++++----- include/alpaka/mem/view/ViewSubView.hpp | 69 ++++++++----------- include/alpaka/test/mem/view/Iterator.hpp | 52 ++++---------- include/alpaka/test/mem/view/ViewTest.hpp | 11 +-- test/integ/mandelbrot/src/mandelbrot.cpp | 4 +- test/integ/matMul/src/matMul.cpp | 11 +-- test/unit/mem/view/src/ViewSubViewTest.cpp | 25 +++---- 13 files changed, 127 insertions(+), 200 deletions(-) diff --git a/example/bufferCopy/src/bufferCopy.cpp b/example/bufferCopy/src/bufferCopy.cpp index 692cfa83e4d4..f56fd21aeeb1 100644 --- a/example/bufferCopy/src/bufferCopy.cpp +++ b/example/bufferCopy/src/bufferCopy.cpp @@ -215,10 +215,10 @@ auto main() -> int // padding between rows/planes of multidimensional memory allocations. // Therefore the pitch (distance between consecutive rows/planes) may be // greater than the space required for the data. 
- Idx const deviceBuffer1Pitch(alpaka::getPitchesInBytes(deviceBuffer1)[2] / sizeof(Data)); - Idx const deviceBuffer2Pitch(alpaka::getPitchesInBytes(deviceBuffer2)[2] / sizeof(Data)); - Idx const hostBuffer1Pitch(alpaka::getPitchesInBytes(hostBuffer)[2] / sizeof(Data)); - Idx const hostViewPlainPtrPitch(alpaka::getPitchesInBytes(hostViewPlainPtr)[2] / sizeof(Data)); + Idx const deviceBuffer1Pitch(alpaka::getPitchesInBytes(deviceBuffer1)[1] / sizeof(Data)); + Idx const deviceBuffer2Pitch(alpaka::getPitchesInBytes(deviceBuffer2)[1] / sizeof(Data)); + Idx const hostBuffer1Pitch(alpaka::getPitchesInBytes(hostBuffer)[1] / sizeof(Data)); + Idx const hostViewPlainPtrPitch(alpaka::getPitchesInBytes(hostViewPlainPtr)[1] / sizeof(Data)); // Test device Buffer // diff --git a/example/randomCells2D/src/randomCells2D.cpp b/example/randomCells2D/src/randomCells2D.cpp index 04d27b888ae8..4db4475adf1b 100644 --- a/example/randomCells2D/src/randomCells2D.cpp +++ b/example/randomCells2D/src/randomCells2D.cpp @@ -201,16 +201,16 @@ auto main() -> int RandomEngineVector* const ptrBufAccRandV{alpaka::getPtrNative(bufAccRandV)}; InitRandomKernel initRandomKernel; - auto pitchBufAccRandS = alpaka::getPitchesInBytes(bufAccRandS)[1]; + auto pitchBufAccRandS = alpaka::getPitchesInBytes(bufAccRandS)[0]; alpaka::exec(queue, workdiv, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS); alpaka::wait(queue); - auto pitchBufAccRandV = alpaka::getPitchesInBytes(bufAccRandV)[1]; + auto pitchBufAccRandV = alpaka::getPitchesInBytes(bufAccRandV)[0]; alpaka::exec(queue, workdiv, initRandomKernel, extent, ptrBufAccRandV, pitchBufAccRandV); alpaka::wait(queue); - auto pitchHostS = alpaka::getPitchesInBytes(bufHostS)[1]; - auto pitchHostV = alpaka::getPitchesInBytes(bufHostV)[1]; + auto pitchHostS = alpaka::getPitchesInBytes(bufHostS)[0]; + auto pitchHostV = alpaka::getPitchesInBytes(bufHostV)[0]; for(Idx y = 0; y < numY; ++y) { @@ -221,7 +221,7 @@ auto main() -> int } } - auto pitchBufAccS = 
alpaka::getPitchesInBytes(bufAccS)[1]; + auto pitchBufAccS = alpaka::getPitchesInBytes(bufAccS)[0]; alpaka::memcpy(queue, bufAccS, bufHostS); RunTimestepKernelSingle runTimestepKernelSingle; alpaka::exec( @@ -235,7 +235,7 @@ auto main() -> int pitchBufAccS); alpaka::memcpy(queue, bufHostS, bufAccS); - auto pitchBufAccV = alpaka::getPitchesInBytes(bufAccV)[1]; + auto pitchBufAccV = alpaka::getPitchesInBytes(bufAccV)[0]; alpaka::memcpy(queue, bufAccV, bufHostV); RunTimestepKernelVector runTimestepKernelVector; alpaka::exec( diff --git a/include/alpaka/idx/MapIdx.hpp b/include/alpaka/idx/MapIdx.hpp index 46f86458625f..d55910b3c72b 100644 --- a/include/alpaka/idx/MapIdx.hpp +++ b/include/alpaka/idx/MapIdx.hpp @@ -136,10 +136,11 @@ namespace alpaka namespace detail { - //! Maps a linear index to a N dimensional index assuming a buffer wihtout padding. + //! Maps a linear index to a N dimensional index assuming a buffer without padding. template struct MapIdxPitchBytes; - //! Maps a N dimensional index to the same N dimensional index assuming a buffer wihtout padding. + + //! Maps a N dimensional index to the same N dimensional index assuming a buffer without padding. template struct MapIdxPitchBytes { @@ -156,7 +157,7 @@ namespace alpaka return idx; } }; - //! Maps a 1 dimensional index to a N dimensional index assuming a buffer wihtout padding. + //! Maps a 1 dimensional index to a N dimensional index assuming a buffer without padding. 
template struct MapIdxPitchBytes 1u)>> { @@ -177,8 +178,8 @@ namespace alpaka TElem tmp = idx[0u]; for(std::size_t d(0u); d < lastIdx; ++d) { - idxNd[d] = static_cast(tmp / pitch[d + 1]); - tmp %= pitch[d + 1]; + idxNd[d] = static_cast(tmp / pitch[d]); + tmp %= pitch[d]; } idxNd[lastIdx] = tmp; @@ -203,7 +204,7 @@ namespace alpaka TElem idx1d = idx[lastDim]; for(std::size_t d(0u); d < lastDim; ++d) { - idx1d = static_cast(idx1d + pitch[d + 1] * idx[d]); + idx1d = static_cast(idx1d + pitch[d] * idx[d]); } return {idx1d}; } @@ -234,7 +235,7 @@ namespace alpaka }; } // namespace detail - //! Maps a N dimensional index to a N dimensional position based on + //! Maps an N dimensional index to a N dimensional position based on //! pitch in a buffer without padding or a byte buffer. //! //! \tparam TidxDimOut Dimension of the index vector to map to. diff --git a/include/alpaka/mem/buf/cpu/Copy.hpp b/include/alpaka/mem/buf/cpu/Copy.hpp index 098c9b406049..391b9885799f 100644 --- a/include/alpaka/mem/buf/cpu/Copy.hpp +++ b/include/alpaka/mem/buf/cpu/Copy.hpp @@ -51,8 +51,6 @@ namespace alpaka { ALPAKA_ASSERT((castVec(m_extent) <= m_dstExtent).foldrAll(std::logical_or())); ALPAKA_ASSERT((castVec(m_extent) <= m_srcExtent).foldrAll(std::logical_or())); - ALPAKA_ASSERT(static_cast(m_extentWidthBytes) <= m_dstPitchBytes[TDim::value - 1u]); - ALPAKA_ASSERT(static_cast(m_extentWidthBytes) <= m_srcPitchBytes[TDim::value - 1u]); } } @@ -100,10 +98,9 @@ namespace alpaka #endif // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one // iteration. - Vec const extentWithoutInnermost(subVecBegin(this->m_extent)); - // [z, y, x] -> [y, x] because the z pitch (the full size of the buffer) is not required. 
- Vec const dstPitchBytesWithoutOutmost(subVecEnd(this->m_dstPitchBytes)); - Vec const srcPitchBytesWithoutOutmost(subVecEnd(this->m_srcPitchBytes)); + Vec const extentWithoutInnermost = subVecBegin(this->m_extent); + Vec const dstPitchBytesWithoutOutmost = subVecBegin(this->m_dstPitchBytes); + Vec const srcPitchBytesWithoutOutmost = subVecBegin(this->m_srcPitchBytes); if(static_cast(this->m_extent.prod()) != 0u) { @@ -112,14 +109,8 @@ namespace alpaka [&](Vec const& idx) { std::memcpy( - reinterpret_cast( - this->m_dstMemNative - + (castVec(idx) * dstPitchBytesWithoutOutmost) - .foldrAll(std::plus())), - reinterpret_cast( - this->m_srcMemNative - + (castVec(idx) * srcPitchBytesWithoutOutmost) - .foldrAll(std::plus())), + this->m_dstMemNative + (castVec(idx) * dstPitchBytesWithoutOutmost).sum(), + this->m_srcMemNative + (castVec(idx) * srcPitchBytesWithoutOutmost).sum(), static_cast(this->m_extentWidthBytes)); }); } diff --git a/include/alpaka/mem/buf/cpu/Set.hpp b/include/alpaka/mem/buf/cpu/Set.hpp index b75dd4135def..54c0f87df2ba 100644 --- a/include/alpaka/mem/buf/cpu/Set.hpp +++ b/include/alpaka/mem/buf/cpu/Set.hpp @@ -39,7 +39,7 @@ namespace alpaka , m_dstMemNative(reinterpret_cast(getPtrNative(view))) { ALPAKA_ASSERT((castVec(m_extent) <= m_dstExtent).foldrAll(std::logical_or())); - ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes[TDim::value - 1u]); + // ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes[TDim::value - 1u]); } #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL @@ -80,9 +80,8 @@ namespace alpaka #endif // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one // iteration. - Vec const extentWithoutInnermost(subVecBegin(this->m_extent)); - // [z, y, x] -> [y, x] because the z pitch (the full idx of the buffer) is not required. 
- Vec const dstPitchBytesWithoutOutmost(subVecEnd(this->m_dstPitchBytes)); + Vec const extentWithoutInnermost = subVecBegin(this->m_extent); + Vec const dstPitchBytesWithoutOutmost = subVecBegin(this->m_dstPitchBytes); if(static_cast(this->m_extent.prod()) != 0u) { @@ -91,10 +90,7 @@ namespace alpaka [&](Vec const& idx) { std::memset( - reinterpret_cast( - this->m_dstMemNative - + (castVec(idx) * dstPitchBytesWithoutOutmost) - .foldrAll(std::plus())), + this->m_dstMemNative + (castVec(idx) * dstPitchBytesWithoutOutmost).sum(), this->m_byte, static_cast(this->m_extentWidthBytes)); }); diff --git a/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp b/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp index 4841a568e1e5..e1b7a78c6b67 100644 --- a/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp +++ b/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp @@ -202,8 +202,8 @@ namespace alpaka , m_dstHeight(static_cast(getHeight(viewDst))) , m_srcHeight(static_cast(getHeight(viewSrc))) # endif - , m_dstPitchBytes(static_cast(getPitchesInBytes(viewDst)[Dim::value - 1u])) - , m_srcPitchBytes(static_cast(getPitchesInBytes(viewSrc)[Dim::value - 1u])) + , m_dstRowPitchBytes(static_cast(getPitchesInBytes(viewDst)[0])) + , m_srcRowPitchBytes(static_cast(getPitchesInBytes(viewSrc)[0])) , m_dstMemNative(reinterpret_cast(getPtrNative(viewDst))) , m_srcMemNative(reinterpret_cast(getPtrNative(viewSrc))) { @@ -212,8 +212,8 @@ namespace alpaka ALPAKA_ASSERT(m_extentHeight <= m_dstHeight); ALPAKA_ASSERT(m_extentWidth <= m_srcWidth); ALPAKA_ASSERT(m_extentHeight <= m_srcHeight); - ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes); - ALPAKA_ASSERT(m_extentWidthBytes <= m_srcPitchBytes); + ALPAKA_ASSERT(m_extentWidthBytes <= m_dstRowPitchBytes); + ALPAKA_ASSERT(m_extentWidthBytes <= m_srcRowPitchBytes); # endif } @@ -236,9 +236,9 @@ namespace alpaka // Initiate the memory copy. 
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memcpy2DAsync( m_dstMemNative, - m_dstPitchBytes, + m_dstRowPitchBytes, m_srcMemNative, - m_srcPitchBytes, + m_srcRowPitchBytes, m_extentWidthBytes, static_cast(m_extentHeight), m_uniformMemCpyKind, @@ -251,9 +251,10 @@ namespace alpaka { std::cout << __func__ << " ew: " << m_extentWidth << " eh: " << m_extentHeight << " ewb: " << m_extentWidthBytes << " ddev: " << m_iDstDevice << " dw: " << m_dstWidth - << " dh: " << m_dstHeight << " dptr: " << m_dstMemNative << " dpitchb: " << m_dstPitchBytes - << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sh: " << m_srcHeight - << " sptr: " << m_srcMemNative << " spitchb: " << m_srcPitchBytes << std::endl; + << " dh: " << m_dstHeight << " dptr: " << m_dstMemNative + << " dpitchb: " << m_dstRowPitchBytes << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth + << " sh: " << m_srcHeight << " sptr: " << m_srcMemNative + << " spitchb: " << m_srcRowPitchBytes << std::endl; } # endif @@ -272,8 +273,8 @@ namespace alpaka Idx m_dstHeight; Idx m_srcHeight; # endif - std::size_t m_dstPitchBytes; - std::size_t m_srcPitchBytes; + std::size_t m_dstRowPitchBytes; + std::size_t m_srcRowPitchBytes; void* m_dstMemNative; void const* m_srcMemNative; @@ -308,12 +309,10 @@ namespace alpaka , m_dstDepth(static_cast(getDepth(viewDst))) , m_srcDepth(static_cast(getDepth(viewSrc))) # endif - , m_dstPitchBytesX(static_cast(getPitchesInBytes(viewDst)[Dim::value - 1u])) - , m_srcPitchBytesX(static_cast(getPitchesInBytes(viewSrc)[Dim::value - 1u])) - , m_dstPitchBytesY(static_cast( - getPitchesInBytes(viewDst)[Dim::value - (2u % Dim::value)])) - , m_srcPitchBytesY(static_cast( - getPitchesInBytes(viewSrc)[Dim::value - (2u % Dim::value)])) + , m_dstRowPitchBytes(static_cast(getPitchesInBytes(viewDst)[1])) + , m_srcRowPitchBytes(static_cast(getPitchesInBytes(viewSrc)[1])) + , m_dstSlicePitchBytes(static_cast(getPitchesInBytes(viewDst)[0])) + , m_srcSlicePitchBytes(static_cast(getPitchesInBytes(viewSrc)[0])) 
, m_dstMemNative(reinterpret_cast(getPtrNative(viewDst))) , m_srcMemNative(reinterpret_cast(getPtrNative(viewSrc))) { @@ -324,8 +323,8 @@ namespace alpaka ALPAKA_ASSERT(m_extentWidth <= m_srcWidth); ALPAKA_ASSERT(m_extentHeight <= m_srcHeight); ALPAKA_ASSERT(m_extentDepth <= m_srcDepth); - ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes); - ALPAKA_ASSERT(m_extentWidthBytes <= m_srcPitchBytes); + ALPAKA_ASSERT(m_extentWidthBytes <= m_dstRowPitchBytes); + ALPAKA_ASSERT(m_extentWidthBytes <= m_srcRowPitchBytes); # endif } @@ -364,16 +363,16 @@ namespace alpaka memCpy3DParms.srcPos = TApi::makePos(0, 0, 0); // Optional. Offset in bytes. memCpy3DParms.srcPtr = TApi::makePitchedPtr( const_cast(m_srcMemNative), - m_srcPitchBytesX, + m_srcRowPitchBytes, static_cast(m_srcWidth), - static_cast(m_srcPitchBytesY / m_srcPitchBytesX)); + static_cast(m_srcSlicePitchBytes / m_srcRowPitchBytes)); memCpy3DParms.dstArray = nullptr; // Either dstArray or dstPtr. memCpy3DParms.dstPos = TApi::makePos(0, 0, 0); // Optional. Offset in bytes. 
memCpy3DParms.dstPtr = TApi::makePitchedPtr( m_dstMemNative, - m_dstPitchBytesX, + m_dstRowPitchBytes, static_cast(m_dstWidth), - static_cast(m_dstPitchBytesY / m_dstPitchBytesX)); + static_cast(m_dstSlicePitchBytes / m_dstRowPitchBytes)); memCpy3DParms.extent = TApi::makeExtent( m_extentWidthBytes, static_cast(m_extentHeight), @@ -388,9 +387,9 @@ namespace alpaka std::cout << __func__ << " ew: " << m_extentWidth << " eh: " << m_extentHeight << " ed: " << m_extentDepth << " ewb: " << m_extentWidthBytes << " ddev: " << m_iDstDevice << " dw: " << m_dstWidth << " dh: " << m_dstHeight << " dd: " << m_dstDepth - << " dptr: " << m_dstMemNative << " dpitchb: " << m_dstPitchBytes + << " dptr: " << m_dstMemNative << " dpitchb: " << m_dstRowPitchBytes << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sh: " << m_srcHeight - << " sd: " << m_srcDepth << " sptr: " << m_srcMemNative << " spitchb: " << m_srcPitchBytes + << " sd: " << m_srcDepth << " sptr: " << m_srcMemNative << " spitchb: " << m_srcRowPitchBytes << std::endl; } # endif @@ -411,10 +410,10 @@ namespace alpaka Idx m_dstDepth; Idx m_srcDepth; # endif - std::size_t m_dstPitchBytesX; - std::size_t m_srcPitchBytesX; - std::size_t m_dstPitchBytesY; - std::size_t m_srcPitchBytesY; + std::size_t m_dstRowPitchBytes; + std::size_t m_srcRowPitchBytes; + std::size_t m_dstSlicePitchBytes; + std::size_t m_srcSlicePitchBytes; void* m_dstMemNative; void const* m_srcMemNative; diff --git a/include/alpaka/mem/view/Traits.hpp b/include/alpaka/mem/view/Traits.hpp index 47ee11785858..8d3e158f3467 100644 --- a/include/alpaka/mem/view/Traits.hpp +++ b/include/alpaka/mem/view/Traits.hpp @@ -34,13 +34,13 @@ namespace alpaka ALPAKA_FN_HOST_ACC constexpr inline auto calculatePitchesFromExtents(Vec const& extent) { Vec pitchBytes{}; - if constexpr(TDim::value > 0) + constexpr auto dim = TIdx{TDim::value}; + if constexpr(dim > 0) + pitchBytes.back() = static_cast(sizeof(TElem)); + if constexpr(dim > 1) { - pitchBytes[TDim::value - 1u] 
= extent[TDim::value - 1u] * static_cast(sizeof(TElem)); - for(TIdx i = TDim::value - 1u; i > static_cast(0u); --i) - { - pitchBytes[i - 1] = extent[i - 1] * pitchBytes[i]; - } + for(TIdx i = 1; i < dim; i++) + pitchBytes[dim - 1 - i] = extent[dim - i] * pitchBytes[dim - i]; } return pitchBytes; } @@ -194,6 +194,11 @@ namespace alpaka //! \return The pitches in bytes. This is the distance in bytes between two consecutive elements in the given //! dimension. + //! E.g. for a 3D view without padding, the last pitch (index 2) is the distance in bytes to jump from one element to the + //! next within the same row, the middle pitch (index 1, aka. the row pitch) is the distance in bytes to jump from one + //! element to the neighboring element on the next row. The first pitch (index 0, aka. the slice pitch) is the distance in + //! bytes to jump from one element to the neighboring element on the next slice. + //! E.g. a 3D view of floats without padding and the extents {42, 10, 2} would have a pitch vector of {80, 8, 4}. template ALPAKA_FN_HOST auto getPitchesInBytes(TView const& view) -> Vec, Idx> { @@ -567,15 +572,6 @@ namespace alpaka auto const ex = getExtents(view); return std::experimental::dextents, Dim::value>{ex[Is]...}; } - - template - ALPAKA_FN_HOST auto makeStrides(Vec const& pitches, std::index_sequence) - { - // alpaka pitches are right-shifted by 1. We skip getPitchBytes<0> (the size in bytes of the entire - // buffer) and append the element size last - return std::array{ - (Is < TDim::value - 1 ? pitches[Is + 1] : static_cast(sizeof(TElem)))...}; - } } // namespace detail //! Customization point for getting an mdspan from a view. 
@@ -588,8 +584,7 @@ namespace alpaka using Element = Elem; auto extents = detail::makeExtents(view, std::make_index_sequence{}); auto* ptr = reinterpret_cast(getPtrNative(view)); - auto const strides - = detail::makeStrides(getPitchesInBytes(view), std::make_index_sequence{}); + auto const strides = toArray(getPitchesInBytes(view)); layout_stride::mapping m{extents, strides}; return mdspan>{ ptr, @@ -602,8 +597,7 @@ namespace alpaka using Element = Elem; auto extents = detail::makeExtents(view, std::make_index_sequence{}); auto* ptr = reinterpret_cast(getPtrNative(view)); - auto strides - = detail::makeStrides(getPitchesInBytes(view), std::make_index_sequence{}); + auto strides = toArray(getPitchesInBytes(view)); std::reverse(begin(strides), end(strides)); layout_stride::mapping m{extents, strides}; return mdspan>{ diff --git a/include/alpaka/mem/view/ViewSubView.hpp b/include/alpaka/mem/view/ViewSubView.hpp index ef784cf58622..7c4831d9f980 100644 --- a/include/alpaka/mem/view/ViewSubView.hpp +++ b/include/alpaka/mem/view/ViewSubView.hpp @@ -42,6 +42,7 @@ namespace alpaka : m_viewParentView(getPtrNative(view), getDev(view), getExtents(view), getPitchesInBytes(view)) , m_extentElements(getExtents(extentElements)) , m_offsetsElements(getOffsetVec(relativeOffsetsElements)) + , m_nativePtr(computeNativePtr()) { ALPAKA_DEBUG_FULL_LOG_SCOPE; @@ -81,6 +82,7 @@ namespace alpaka : m_viewParentView(getPtrNative(view), getDev(view), getExtents(view), getPitchesInBytes(view)) , m_extentElements(getExtents(extentElements)) , m_offsetsElements(getOffsetVec(relativeOffsetsElements)) + , m_nativePtr(computeNativePtr()) { ALPAKA_DEBUG_FULL_LOG_SCOPE; @@ -127,9 +129,32 @@ namespace alpaka } public: + ALPAKA_FN_HOST auto computeNativePtr() + { +#if BOOST_COMP_GNUC +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored \ + "-Wcast-align" // "cast from 'std::uint8_t*' to 'TElem*' increases required alignment of target type" +#endif + return reinterpret_cast( + 
reinterpret_cast(alpaka::getPtrNative(m_viewParentView)) + + pitchedOffsetBytes(std::make_index_sequence{})); +#if BOOST_COMP_GNUC +# pragma GCC diagnostic pop +#endif + } + + template + ALPAKA_FN_HOST auto pitchedOffsetBytes(std::index_sequence const&) -> TIdx + { + auto const pitches = getPitchesInBytes(m_viewParentView); + return (m_offsetsElements * pitches).sum(); + } + ViewPlainPtr m_viewParentView; // This wraps the parent view. Vec m_extentElements; // The extent of this view. Vec m_offsetsElements; // The offset relative to the parent view. + TElem* m_nativePtr; }; // Trait specializations for ViewSubView. @@ -176,57 +201,19 @@ namespace alpaka } }; -#if BOOST_COMP_GNUC -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored \ - "-Wcast-align" // "cast from 'std::uint8_t*' to 'TElem*' increases required alignment of target type" -#endif //! The ViewSubView native pointer get trait specialization. template struct GetPtrNative> { - private: - using IdxSequence = std::make_index_sequence; - - public: ALPAKA_FN_HOST static auto getPtrNative(ViewSubView const& view) -> TElem const* { - // \TODO: pre-calculate this pointer for faster execution. - return reinterpret_cast( - reinterpret_cast(alpaka::getPtrNative(view.m_viewParentView)) - + pitchedOffsetBytes(view, IdxSequence())); + return view.m_nativePtr; } ALPAKA_FN_HOST static auto getPtrNative(ViewSubView& view) -> TElem* { - // \TODO: pre-calculate this pointer for faster execution. - return reinterpret_cast( - reinterpret_cast(alpaka::getPtrNative(view.m_viewParentView)) - + pitchedOffsetBytes(view, IdxSequence())); - } - - private: - //! For a 3D vector this calculates: - //! - //! getOffsets(view)[0] * getPitchBytes<1u>(view) - //! + getOffsets(view)[1] * getPitchBytes<2u>(view) - //! + getOffsets(view)[2] * getPitchBytes<3u>(view) - //! 
while getPitchBytes<3u>(view) is equivalent to sizeof(TElem) - template - ALPAKA_FN_HOST static auto pitchedOffsetBytes(TView const& view, std::index_sequence const&) - -> TIdx - { - auto const offsets = getOffsets(view); - auto const pitches = getPitchesInBytes(view); - return ( - (offsets[TIndices] - * (TIndices + 1 < Dim::value ? pitches[TIndices + 1] - : static_cast(sizeof(Elem)))) - + ... + TIdx{0}); // FIXME: see comment above + return view.m_nativePtr; } }; -#if BOOST_COMP_GNUC -# pragma GCC diagnostic pop -#endif //! The ViewSubView pitch get trait specialization. template @@ -234,7 +221,7 @@ namespace alpaka { ALPAKA_FN_HOST auto operator()(ViewSubView const& view) const { - return alpaka::getPitchesInBytes(view.m_viewParentView); + return getPitchesInBytes(view.m_viewParentView); } }; diff --git a/include/alpaka/test/mem/view/Iterator.hpp b/include/alpaka/test/mem/view/Iterator.hpp index 42295ff0de94..33fb481b0a80 100644 --- a/include/alpaka/test/mem/view/Iterator.hpp +++ b/include/alpaka/test/mem/view/Iterator.hpp @@ -17,11 +17,6 @@ namespace alpaka::test template using MimicConst = std::conditional_t, std::add_const_t, std::remove_const_t>; -#if BOOST_COMP_GNUC -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored \ - "-Wcast-align" // "cast from 'Byte*' to 'Elem*' increases required alignment of target type" -#endif template class IteratorView { @@ -39,7 +34,7 @@ namespace alpaka::test { } - ALPAKA_FN_HOST IteratorView(TView& view) : IteratorView(view, 0) + ALPAKA_FN_HOST explicit IteratorView(TView& view) : IteratorView(view, 0) { } @@ -87,50 +82,27 @@ namespace alpaka::test return *m_nativePtr; else { - using Dim1 = DimInt<1>; - using DimMin1 = DimInt; - - Vec const currentIdxDim1{m_currentIdx}; - Vec const currentIdxDimx(mapIdx(currentIdxDim1, m_extents)); - - // [pz, py, px] -> [py, px] - auto const pitchWithoutOutermost = subVecEnd(m_pitchBytes); - // [ElemSize] - Vec const elementSizeVec = static_cast(sizeof(Elem)); - // [py, px] ++ 
[ElemSize] -> [py, px, ElemSize] - Vec const dstPitchBytes = concatVec(pitchWithoutOutermost, elementSizeVec); - // [py, px, ElemSize] [z, y, x] -> [py*z, px*y, ElemSize*x] - auto const dimensionalOffsetsInByte = currentIdxDimx * dstPitchBytes; - // sum{[py*z, px*y, ElemSize*x]} -> offset in byte - auto const offsetInByte = dimensionalOffsetsInByte.foldrAll(std::plus()); - - using Byte = MimicConst; - Byte* ptr(reinterpret_cast(m_nativePtr) + offsetInByte); - -#if 0 - std::cout - << " i1: " << currentIdxDim1 - << " in: " << currentIdxDimx - << " dpb: " << dstPitchBytes - << " offb: " << offsetInByte - << " ptr: " << reinterpret_cast(ptr) - << " v: " << *reinterpret_cast(ptr) - << std::endl; + Vec const currentIdxDimx = mapIdx(Vec{m_currentIdx}, m_extents); + auto const offsetInBytes = (currentIdxDimx * m_pitchBytes).sum(); + using QualifiedByte = MimicConst; +#if BOOST_COMP_GNUC +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored \ + "-Wcast-align" // "cast from 'Byte*' to 'Elem*' increases required alignment of target type" +#endif + return *reinterpret_cast(reinterpret_cast(m_nativePtr) + offsetInBytes); +#if BOOST_COMP_GNUC +# pragma GCC diagnostic pop #endif - return *reinterpret_cast(ptr); } ALPAKA_UNREACHABLE(*m_nativePtr); } - private: Elem* m_nativePtr; Idx m_currentIdx; Vec m_extents; Vec m_pitchBytes; }; -#if BOOST_COMP_GNUC -# pragma GCC diagnostic pop -#endif template struct Begin diff --git a/include/alpaka/test/mem/view/ViewTest.hpp b/include/alpaka/test/mem/view/ViewTest.hpp index e0a534fc6a7c..69992fc92229 100644 --- a/include/alpaka/test/mem/view/ViewTest.hpp +++ b/include/alpaka/test/mem/view/ViewTest.hpp @@ -57,16 +57,7 @@ namespace alpaka::test // trait::GetPitchBytes { - // The pitches have to be at least as large as the values we calculate here. - auto pitchMinimum = Vec, TIdx>::ones(); - // Initialize the pitch between two elements of the X dimension ... - pitchMinimum[TDim::value] = sizeof(TElem); - // ... 
and fill all the other dimensions. - for(TIdx i = TDim::value; i > static_cast(0u); --i) - { - pitchMinimum[i - 1] = extent[i - 1] * pitchMinimum[i]; - } - + auto const pitchMinimum = alpaka::detail::calculatePitchesFromExtents(extent); auto const pitchView = getPitchesInBytes(view); for(TIdx i = TDim::value; i > static_cast(0u); --i) diff --git a/test/integ/mandelbrot/src/mandelbrot.cpp b/test/integ/mandelbrot/src/mandelbrot.cpp index a9812fbcf367..ba2385c231f3 100644 --- a/test/integ/mandelbrot/src/mandelbrot.cpp +++ b/test/integ/mandelbrot/src/mandelbrot.cpp @@ -203,7 +203,7 @@ auto writeTgaColorImage(std::string const& fileName, TBuf const& bufRgba) -> voi ALPAKA_ASSERT(bufWidthColors >= 1); auto const bufHeightColors = alpaka::getHeight(bufRgba); ALPAKA_ASSERT(bufHeightColors >= 1); - auto const bufPitchBytes = alpaka::getPitchesInBytes(bufRgba)[alpaka::Dim::value - 1u]; + auto const bufPitchBytes = alpaka::getPitchesInBytes(bufRgba)[0]; ALPAKA_ASSERT(bufPitchBytes >= bufWidthBytes); std::ofstream ofs(fileName, std::ofstream::out | std::ofstream::binary); @@ -321,7 +321,7 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs) alpaka::getPtrNative(bufColorAcc), numRows, numCols, - alpaka::getPitchesInBytes(bufColorAcc)[1], + alpaka::getPitchesInBytes(bufColorAcc)[0], fMinR, fMaxR, fMinI, diff --git a/test/integ/matMul/src/matMul.cpp b/test/integ/matMul/src/matMul.cpp index ac252a1563af..85e7dd7605b6 100644 --- a/test/integ/matMul/src/matMul.cpp +++ b/test/integ/matMul/src/matMul.cpp @@ -83,7 +83,7 @@ class MatMulKernel for(TIndex k2(0u); k2 < blockMulCount; ++k2) { // Copy the current blocks of A and B into shared memory in parallel. - // If the element of the current thread is outside of the matrix, zero is written into the shared memory. + // If the element of the current thread is outside the matrix, zero is written into the shared memory. // This is possible because zero is a result neutral extension of the matrices regarding the dot product. 
auto const AIdxX = k2 * blockThreadExtentX + blockThreadIdxX; auto const AIdx1d = gridThreadIdxY * lda + AIdxX; @@ -242,11 +242,12 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs) alpaka::memcpy(queueAcc, bufCAcc, bufCHost); - auto const pitchA = alpaka::getPitchesInBytes(bufAAcc)[1]; - auto const pitchB = alpaka::getPitchesInBytes(bufBAcc)[1]; - auto const pitchC = alpaka::getPitchesInBytes(bufCAcc)[1]; + auto const pitchA = alpaka::getPitchesInBytes(bufAAcc)[0]; + auto const pitchB = alpaka::getPitchesInBytes(bufBAcc)[0]; + auto const pitchC = alpaka::getPitchesInBytes(bufCAcc)[0]; + std::cout << "pitchA " << alpaka::getPitchesInBytes(bufAAcc) << "\n"; - // Assumptions we make + // We assume that the row pitches are divisible by the element size REQUIRE(pitchA % sizeof(Val) == 0); REQUIRE(pitchB % sizeof(Val) == 0); REQUIRE(pitchC % sizeof(Val) == 0); diff --git a/test/unit/mem/view/src/ViewSubViewTest.cpp b/test/unit/mem/view/src/ViewSubViewTest.cpp index ad6107415902..9a1d071291e3 100644 --- a/test/unit/mem/view/src/ViewSubViewTest.cpp +++ b/test/unit/mem/view/src/ViewSubViewTest.cpp @@ -35,14 +35,10 @@ namespace alpaka::test // alpaka::trait::GetPitchesInBytes // The pitch of the view has to be identical to the pitch of the underlying buffer in all dimensions. + auto const pitchBuf = alpaka::getPitchesInBytes(buf); { - auto const pitchBuf = alpaka::getPitchesInBytes(buf); auto const pitchView = alpaka::getPitchesInBytes(view); - - for(TIdx i = TDim::value; i > static_cast(0u); --i) - { - REQUIRE(pitchBuf[i - static_cast(1u)] == pitchView[i - static_cast(1u)]); - } + CHECK(pitchBuf == pitchView); } // alpaka::trait::GetPtrNative @@ -50,15 +46,7 @@ namespace alpaka::test { auto viewPtrNative = reinterpret_cast(alpaka::getPtrNative(buf)); if constexpr(TDim::value > 0) - { - auto const pitchBuf = alpaka::getPitchesInBytes(buf); - for(TIdx i = TDim::value; i > static_cast(0u); --i) - { - auto const pitch - = (i < static_cast(TDim::value)) ? 
pitchBuf[i] : static_cast(sizeof(TElem)); - viewPtrNative += offsetView[i - static_cast(1u)] * pitch; - } - } + viewPtrNative += (offsetView * pitchBuf).sum(); REQUIRE(reinterpret_cast(viewPtrNative) == alpaka::getPtrNative(view)); } } @@ -159,3 +147,10 @@ TEMPLATE_LIST_TEST_CASE("viewSubViewOffsetConstTest", "[memView]", alpaka::test: { alpaka::test::testViewSubViewOffsetConst(); } + +TEST_CASE("calculatePitchesFromExtents", "[memView]") +{ + CHECK((alpaka::detail::calculatePitchesFromExtents(alpaka::Vec{1, 1, 1}) == alpaka::Vec{4, 4, 4})); + CHECK((alpaka::detail::calculatePitchesFromExtents(alpaka::Vec{2, 2, 2}) == alpaka::Vec{16, 8, 4})); + CHECK((alpaka::detail::calculatePitchesFromExtents(alpaka::Vec{42, 10, 2}) == alpaka::Vec{80, 8, 4})); +}